Index: llvm/trunk/test/CodeGen/AArch64/srem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/srem-lkk.ll +++ llvm/trunk/test/CodeGen/AArch64/srem-lkk.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #36849 +; CHECK-NEXT: movk w8, #15827, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #65445 +; CHECK-NEXT: movk w8, #42330, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #-723 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #62439 +; CHECK-NEXT: movk w8, #64805, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #-22981 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #63 // =63 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xffffffc0 +; CHECK-NEXT: sub w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
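The scalar tests above all pin down the same shape of code: the srem by a constant is rewritten as a widening multiply by a magic constant, a couple of shifts and an msub, so no sdiv is emitted. As a rough standalone sketch (not part of the patch; the function name and layout are chosen here for illustration), the srem-by-95 case corresponds to IR along these lines, where -1401515643 is 0xAC769185, the value built by the mov #37253 / movk #44150 pair in the checks:

define i32 @srem95_expanded(i32 %x) {
  %xext = sext i32 %x to i64
  %mul  = mul i64 %xext, -1401515643   ; smull by the magic constant
  %hi64 = lshr i64 %mul, 32            ; keep the high 32 bits of the product
  %hi   = trunc i64 %hi64 to i32
  %t    = add i32 %hi, %x              ; the magic is negative, so add the dividend back
  %sh   = ashr i32 %t, 6
  %sgn  = lshr i32 %t, 31              ; +1 correction when the intermediate is negative
  %q    = add i32 %sh, %sgn            ; q == x sdiv 95
  %back = mul i32 %q, 95
  %r    = sub i32 %x, %back            ; r == x srem 95 (the msub in the checks)
  ret i32 %r
}

The dont_fold_srem_i32_smax test that follows covers the 2^31 divisor mentioned in the comment above; that case is handled with a compare, select and mask rather than a multiply.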
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: add w8, w0, w8 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #58849 +; CHECK-NEXT: movk x8, #48148, lsl #16 +; CHECK-NEXT: movk x8, #33436, lsl #32 +; CHECK-NEXT: movk x8, #21399, lsl #48 +; CHECK-NEXT: smulh x8, x0, x8 +; CHECK-NEXT: asr x9, x8, #5 +; CHECK-NEXT: add x8, x9, x8, lsr #63 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -0,0 +1,324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #63421 +; CHECK-NEXT: mov w12, #33437 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: movk w9, #31710, lsl #16 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: movk w12, #21399, lsl #16 +; CHECK-NEXT: smull x12, w11, w12 +; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: lsr x13, x12, #63 +; CHECK-NEXT: asr x12, x12, #37 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w12, w12, w13 +; CHECK-NEXT: mov w13, #98 +; CHECK-NEXT: sub w9, w9, w8 +; CHECK-NEXT: msub w11, w12, w13, w11 +; CHECK-NEXT: asr w13, w9, #6 +; CHECK-NEXT: add w9, w13, w9, lsr #31 +; CHECK-NEXT: mov w13, #37253 +; CHECK-NEXT: mov w10, #-124 +; CHECK-NEXT: smov w12, v0.h[0] +; CHECK-NEXT: movk w13, #44150, lsl #16 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: smull x10, w12, w13 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: asr w13, w10, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: add w10, w13, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w9, w12 +; CHECK-NEXT: mov w10, #63249 +; CHECK-NEXT: smov w13, v0.h[3] +; CHECK-NEXT: movk w10, #48808, lsl #16 +; CHECK-NEXT: smull x10, w13, w10 +; CHECK-NEXT: lsr x12, x10, #63 +; CHECK-NEXT: asr x10, x10, #40 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #-1003 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w10, w8, w13 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #37253 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: movk w9, #44150, lsl #16 +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smull x13, w8, w9 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smull x14, w10, w9 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smull x15, w11, w9 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: add w13, w13, w8 +; CHECK-NEXT: smull x9, w12, w9 +; 
CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w14, w14, w10 +; CHECK-NEXT: asr w16, w13, #6 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w15, w15, w11 +; CHECK-NEXT: add w13, w16, w13, lsr #31 +; CHECK-NEXT: asr w16, w14, #6 +; CHECK-NEXT: add w9, w9, w12 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w15, #6 +; CHECK-NEXT: add w15, w16, w15, lsr #31 +; CHECK-NEXT: asr w16, w9, #6 +; CHECK-NEXT: add w9, w16, w9, lsr #31 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: msub w8, w13, w16, w8 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w9, w16, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smov w9, v0.h[1] +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smull x13, w9, w8 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: add w13, w13, w9 +; CHECK-NEXT: smull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w14, w14, w10 +; CHECK-NEXT: asr w16, w13, #6 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w15, w15, w11 +; CHECK-NEXT: add w13, w16, w13, lsr #31 +; CHECK-NEXT: asr w16, w14, #6 +; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w15, #6 +; CHECK-NEXT: add w15, w16, w15, lsr #31 +; CHECK-NEXT: asr w16, w8, #6 +; CHECK-NEXT: add w8, w16, w8, lsr #31 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
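For a power-of-two divisor the cheaper bias-and-mask form is used instead of a multiply: add divisor minus 1 to the dividend when the dividend is negative, mask off the low bits to round toward zero, and subtract. A branchless sketch for divisor 64 (illustrative; the generated AArch64 code picks the bias with cmp/csel, which computes the same value) is:

define i32 @srem_pow2_expanded(i32 %x) {
  %sign = ashr i32 %x, 31              ; all ones when x is negative
  %bias = lshr i32 %sign, 26           ; 63 when x is negative, 0 otherwise
  %add  = add i32 %x, %bias
  %rdn  = and i32 %add, -64            ; round toward zero to a multiple of 64
  %r    = sub i32 %x, %rdn             ; r == x srem 64
  ret i32 %r
}

The dont_fold_srem_power_of_two test that follows applies this per lane, with each lane's power of two visible in the add and mask immediates, plus one lane that still needs the multiply-by-magic sequence for 95.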
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: add w12, w8, #31 // =31 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: mov w11, #37253 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: smov w9, v0.h[0] +; CHECK-NEXT: smov w10, v0.h[3] +; CHECK-NEXT: movk w11, #44150, lsl #16 +; CHECK-NEXT: and w12, w12, #0xffffffe0 +; CHECK-NEXT: sub w8, w8, w12 +; CHECK-NEXT: add w12, w9, #63 // =63 +; CHECK-NEXT: smull x11, w10, w11 +; CHECK-NEXT: cmp w9, #0 // =0 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: add w11, w11, w10 +; CHECK-NEXT: and w12, w12, #0xffffffc0 +; CHECK-NEXT: sub w9, w9, w12 +; CHECK-NEXT: asr w12, w11, #6 +; CHECK-NEXT: add w11, w12, w11, lsr #31 +; CHECK-NEXT: smov w12, v0.h[2] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w9, w12, #7 // =7 +; CHECK-NEXT: cmp w12, #0 // =0 +; CHECK-NEXT: csel w9, w9, w12, lt +; CHECK-NEXT: and w9, w9, #0xfffffff8 +; CHECK-NEXT: sub w9, w12, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #95 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: msub w8, w11, w8, w10 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: movk w9, #45590, lsl #16 +; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w9, w9, w8 +; CHECK-NEXT: asr w12, w9, #4 +; CHECK-NEXT: add w9, w12, w9, lsr #31 +; CHECK-NEXT: mov w12, #30865 +; CHECK-NEXT: mov w10, #23 +; CHECK-NEXT: smov w11, v0.h[1] +; CHECK-NEXT: movk w12, #51306, lsl #16 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: smull x10, w11, w12 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: asr w12, w10, #9 +; CHECK-NEXT: mov w9, #654 +; CHECK-NEXT: add w10, w12, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w9, w11 +; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: movk w10, #24749, lsl #16 +; CHECK-NEXT: smull x10, w12, w10 +; CHECK-NEXT: lsr x11, x10, #63 +; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: mov v0.h[1], w9 +; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: msub w8, w10, w9, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
+define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w10, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w9, v0.h[2] +; CHECK-NEXT: movk w10, #45590, lsl #16 +; CHECK-NEXT: smull x10, w9, w10 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w9 +; CHECK-NEXT: asr w12, w10, #4 +; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: add w10, w12, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w11, w9 +; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: movk w10, #24749, lsl #16 +; CHECK-NEXT: smull x10, w12, w10 +; CHECK-NEXT: lsr x11, x10, #63 +; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: mov w11, #32767 +; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: and w11, w11, #0xffff8000 +; CHECK-NEXT: sub w8, w8, w11 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #5423 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: msub w8, w10, w8, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. +define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #6055 +; CHECK-NEXT: movk x9, #58853, lsl #16 +; CHECK-NEXT: movk x9, #47142, lsl #32 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: movk x9, #24749, lsl #48 +; CHECK-NEXT: smulh x9, x8, x9 +; CHECK-NEXT: asr x12, x9, #11 +; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: add x9, x12, x9, lsr #63 +; CHECK-NEXT: msub x8, x9, x10, x8 +; CHECK-NEXT: mov x9, #21445 +; CHECK-NEXT: movk x9, #1603, lsl #16 +; CHECK-NEXT: movk x9, #15432, lsl #32 +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: movk x9, #25653, lsl #48 +; CHECK-NEXT: smulh x9, x12, x9 +; CHECK-NEXT: asr x10, x9, #8 +; CHECK-NEXT: add x9, x10, x9, lsr #63 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: msub x9, x9, x10, x12 +; CHECK-NEXT: mov x10, #8549 +; CHECK-NEXT: movk x10, #22795, lsl #16 +; CHECK-NEXT: movk x10, #17096, lsl #32 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: movk x10, #45590, lsl #48 +; CHECK-NEXT: smulh x10, x11, x10 +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: asr x12, x10, #4 +; CHECK-NEXT: add x10, x12, x10, lsr #63 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: msub x10, x10, x12, x11 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: ret + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/AArch64/urem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/urem-lkk.ll +++ llvm/trunk/test/CodeGen/AArch64/urem-lkk.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w9, w0, w8 +; CHECK-NEXT: add w8, w8, w9, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = urem i32 
%x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16323 +; CHECK-NEXT: movk w8, #63310, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. +define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w9, w0, w8 +; CHECK-NEXT: add w8, w8, w9, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x3f +; CHECK-NEXT: ret + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #58849 +; CHECK-NEXT: movk x9, #48148, lsl #16 +; CHECK-NEXT: movk x9, #33436, lsl #32 +; CHECK-NEXT: lsr x8, x0, #1 +; CHECK-NEXT: movk x9, #21399, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/AArch64/urem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/urem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w11, #33437 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: movk w11, #21399, lsl #16 +; CHECK-NEXT: umull x11, w10, w11 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov w9, #16913 +; CHECK-NEXT: mov w12, #98 +; CHECK-NEXT: lsr x11, x11, #37 +; CHECK-NEXT: movk w9, #8456, lsl #16 +; CHECK-NEXT: msub w10, w11, w12, w10 +; CHECK-NEXT: ubfx w12, w8, #2, #14 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: mov w11, #124 +; CHECK-NEXT: lsr x9, x9, #34 +; CHECK-NEXT: msub w8, w9, w11, w8 +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: umov w12, v0.h[0] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: sub w11, w12, w9 +; CHECK-NEXT: add w9, w9, w11, lsr #1 +; CHECK-NEXT: mov w11, #95 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: msub w9, w9, w11, w12 +; 
CHECK-NEXT: umov w11, v0.h[3] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov w9, #2287 +; CHECK-NEXT: movk w9, #16727, lsl #16 +; CHECK-NEXT: umull x9, w11, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #1003 +; CHECK-NEXT: lsr x9, x9, #40 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: msub w8, w9, w8, w11 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umull x13, w8, w9 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umull x14, w10, w9 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umull x15, w11, w9 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: sub w16, w8, w13 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w14, w14, w16, lsr #1 +; CHECK-NEXT: sub w16, w11, w15 +; CHECK-NEXT: add w15, w15, w16, lsr #1 +; CHECK-NEXT: sub w16, w12, w9 +; CHECK-NEXT: add w9, w9, w16, lsr #1 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: lsr w13, w13, #6 +; CHECK-NEXT: msub w8, w13, w16, w8 +; CHECK-NEXT: lsr w13, w14, #6 +; CHECK-NEXT: msub w10, w13, w16, w10 +; CHECK-NEXT: lsr w13, w15, #6 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: msub w11, w13, w16, w11 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w9, w16, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. 
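fold_urem_vec_2 above repeats the divisor 95 in every lane, so each lane goes through the same unsigned expansion as the scalar fold_urem_positive_odd test: a umull by the 8969/22765 magic, an add-back fixup, a shift by 6 and an msub. A standalone sketch of that arithmetic (illustrative only; the name urem95_expanded is not from the patch) is:

define i32 @urem95_expanded(i32 %x) {
  %xext = zext i32 %x to i64
  %mul  = mul i64 %xext, 1491936009    ; umull by the magic constant 0x58ED2309
  %hi64 = lshr i64 %mul, 32
  %hi   = trunc i64 %hi64 to i32       ; high half of the product
  %sub  = sub i32 %x, %hi
  %half = lshr i32 %sub, 1
  %t    = add i32 %hi, %half           ; fixup: the unsigned magic does not fit in 32 bits
  %q    = lshr i32 %t, 6               ; q == x udiv 95
  %back = mul i32 %q, 95
  %r    = sub i32 %x, %back            ; r == x urem 95
  ret i32 %r
}

The combine_urem_udiv test below checks the case where both the remainder and the quotient are live; the expansion is emitted once and the quotient is simply reused for the final add.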
+define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umull x13, w9, w8 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: sub w16, w9, w13 +; CHECK-NEXT: umull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w14, w14, w16, lsr #1 +; CHECK-NEXT: sub w16, w11, w15 +; CHECK-NEXT: add w15, w15, w16, lsr #1 +; CHECK-NEXT: sub w16, w12, w8 +; CHECK-NEXT: add w8, w8, w16, lsr #1 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: lsr w14, w14, #6 +; CHECK-NEXT: lsr w13, w13, #6 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: lsr w15, w15, #6 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + + +; Don't fold for divisors that are a power of two. +define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: sub w10, w8, w9 +; CHECK-NEXT: add w9, w9, w10, lsr #1 +; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: and w9, w9, #0x3f +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: and w10, w10, #0x1f +; CHECK-NEXT: and w9, w9, #0x7 +; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: movk w9, #45590, lsl #16 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: mov w10, #23 +; CHECK-NEXT: lsr x9, x9, #36 +; CHECK-NEXT: umov w11, v0.h[1] +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: mov w9, #30865 +; CHECK-NEXT: movk w9, #51306, lsl #16 +; CHECK-NEXT: ubfx w10, w11, #1, #15 +; CHECK-NEXT: umull x9, w10, w9 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: lsr x9, x9, #40 +; CHECK-NEXT: msub w9, w9, w10, w11 +; CHECK-NEXT: mov w11, #47143 +; CHECK-NEXT: umov w10, v0.h[3] +; CHECK-NEXT: movk w11, #24749, lsl #16 +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: umull x11, w10, w11 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: lsr x11, x11, #43 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: msub w8, w11, w9, w10 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. +define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #12109 +; CHECK-NEXT: movk x10, #52170, lsl #16 +; CHECK-NEXT: movk x10, #28749, lsl #32 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: movk x10, #49499, lsl #48 +; CHECK-NEXT: umulh x10, x8, x10 +; CHECK-NEXT: mov w11, #5423 +; CHECK-NEXT: lsr x10, x10, #12 +; CHECK-NEXT: msub x8, x10, x11, x8 +; CHECK-NEXT: mov x10, #21445 +; CHECK-NEXT: movk x10, #1603, lsl #16 +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: movk x10, #15432, lsl #32 +; CHECK-NEXT: movk x10, #25653, lsl #48 +; CHECK-NEXT: lsr x11, x12, #1 +; CHECK-NEXT: umulh x10, x11, x10 +; CHECK-NEXT: mov w11, #654 +; CHECK-NEXT: lsr x10, x10, #7 +; CHECK-NEXT: msub x10, x10, x11, x12 +; CHECK-NEXT: mov x11, #17097 +; CHECK-NEXT: movk x11, #45590, lsl #16 +; CHECK-NEXT: movk x11, #34192, lsl #32 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movk x11, #25644, lsl #48 +; CHECK-NEXT: umulh x11, x9, x11 +; CHECK-NEXT: sub x12, x9, x11 +; CHECK-NEXT: add x11, x11, x12, lsr #1 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: lsr x11, x11, #4 +; CHECK-NEXT: msub x9, x11, x12, x9 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x10 +; CHECK-NEXT: ret + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/PowerPC/srem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/srem-lkk.ll +++ llvm/trunk/test/CodeGen/PowerPC/srem-lkk.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -21386 +; CHECK-NEXT: ori 4, 4, 37253 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: add 4, 4, 3 +; 
CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 6 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, 95 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 15827 +; CHECK-NEXT: ori 4, 4, 36849 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, 1060 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -23206 +; CHECK-NEXT: ori 4, 4, 65445 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, -723 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -731 +; CHECK-NEXT: ori 4, 4, 62439 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, -22981 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -21386 +; CHECK-NEXT: ori 4, 4, 37253 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: add 4, 4, 3 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 6 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 5, 4, 95 +; CHECK-NEXT: subf 3, 5, 3 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: srawi 4, 3, 6 +; CHECK-NEXT: addze 4, 4 +; CHECK-NEXT: slwi 4, 4, 6 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: srawi 4, 3, 31 +; CHECK-NEXT: addze 4, 4 +; CHECK-NEXT: slwi 4, 4, 31 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 0, 4(1) +; CHECK-NEXT: stwu 1, -16(1) +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset lr, 4 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 6, 98 +; CHECK-NEXT: bl __moddi3@PLT +; CHECK-NEXT: lwz 0, 20(1) +; CHECK-NEXT: addi 1, 1, 16 +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/PowerPC/srem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -0,0 +1,1675 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: fold_srem_vec_1: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 31710 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 63421 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r4, r5 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 21399 +; P9LE-NEXT: mulli r4, r4, -124 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 33437 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 5 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -16728 +; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 63249 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, 
r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 8 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, -1003 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_srem_vec_1: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, 31710 +; P9BE-NEXT: ori r4, r4, 63421 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r4, r3, r4 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, -124 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -16728 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 63249 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 8 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, -1003 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 21399 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 33437 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 5 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_srem_vec_1: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, 21399 +; P8LE-NEXT: lis r9, -16728 +; P8LE-NEXT: lis r11, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 33437 +; P8LE-NEXT: ori r9, r9, 63249 +; P8LE-NEXT: ori r11, r11, 37253 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: rldicl r3, r5, 32, 48 +; P8LE-NEXT: rldicl r6, r5, 16, 48 +; P8LE-NEXT: clrldi r7, r5, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: extsh r10, r6 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsh r12, r7 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: lis r8, 31710 +; P8LE-NEXT: extsh r0, r5 +; P8LE-NEXT: extsw r12, r12 +; P8LE-NEXT: mulld r9, r10, r9 +; P8LE-NEXT: ori r8, r8, 63421 +; P8LE-NEXT: extsw r10, r0 +; P8LE-NEXT: mulld r11, r12, r11 +; P8LE-NEXT: mulld r8, r10, r8 +; P8LE-NEXT: rldicl r0, r4, 1, 63 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: rldicl r30, r9, 1, 63 +; P8LE-NEXT: rldicl r9, r9, 32, 32 +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: rldicl r8, r8, 32, 32 +; P8LE-NEXT: add r11, r11, r12 +; 
P8LE-NEXT: srawi r4, r4, 5 +; P8LE-NEXT: subf r8, r10, r8 +; P8LE-NEXT: srawi r9, r9, 8 +; P8LE-NEXT: srwi r10, r11, 31 +; P8LE-NEXT: add r4, r4, r0 +; P8LE-NEXT: srawi r11, r11, 6 +; P8LE-NEXT: add r9, r9, r30 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: add r10, r11, r10 +; P8LE-NEXT: srwi r11, r8, 31 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: mulli r4, r4, 98 +; P8LE-NEXT: mulli r9, r9, -1003 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r8, r8, -124 +; P8LE-NEXT: subf r3, r4, r3 +; P8LE-NEXT: subf r4, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: subf r4, r8, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_srem_vec_1: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -16728 +; P8BE-NEXT: lis r9, 31710 +; P8BE-NEXT: lis r8, 21399 +; P8BE-NEXT: lis r10, -21386 +; P8BE-NEXT: ori r3, r3, 63249 +; P8BE-NEXT: ori r9, r9, 63421 +; P8BE-NEXT: ori r8, r8, 33437 +; P8BE-NEXT: ori r10, r10, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r9, r7, r9 +; P8BE-NEXT: mulld r8, r6, r8 +; P8BE-NEXT: mulld r10, r4, r10 +; P8BE-NEXT: rldicl r11, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: rldicl r12, r8, 1, 63 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: subf r9, r7, r9 +; P8BE-NEXT: srawi r3, r3, 8 +; P8BE-NEXT: srawi r8, r8, 5 +; P8BE-NEXT: add r10, r10, r4 +; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: srwi r11, r9, 31 +; P8BE-NEXT: add r8, r8, r12 +; P8BE-NEXT: srawi r9, r9, 6 +; P8BE-NEXT: mulli r3, r3, -1003 +; P8BE-NEXT: add r9, r9, r11 +; P8BE-NEXT: srwi r11, r10, 31 +; P8BE-NEXT: srawi r10, r10, 6 +; P8BE-NEXT: mulli r8, r8, 98 +; P8BE-NEXT: add r10, r10, r11 +; P8BE-NEXT: mulli r9, r9, -124 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: subf r5, r8, r6 +; P8BE-NEXT: mtvsrd v2, r3 +; P8BE-NEXT: subf r6, r9, r7 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: subf r4, r10, r4 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: fold_srem_vec_2: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; 
P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_srem_vec_2: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_srem_vec_2: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 37253 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: clrldi r3, 
r5, 48 +; P8LE-NEXT: rldicl r7, r5, 32, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: rldicl r6, r5, 48, 48 +; P8LE-NEXT: extsh r10, r7 +; P8LE-NEXT: rldicl r5, r5, 16, 48 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: extsh r11, r5 +; P8LE-NEXT: mulld r12, r8, r4 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsw r11, r11 +; P8LE-NEXT: mulld r30, r10, r4 +; P8LE-NEXT: mulld r0, r9, r4 +; P8LE-NEXT: mulld r4, r11, r4 +; P8LE-NEXT: rldicl r12, r12, 32, 32 +; P8LE-NEXT: add r8, r12, r8 +; P8LE-NEXT: rldicl r12, r30, 32, 32 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: add r10, r12, r10 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: srwi r0, r8, 31 +; P8LE-NEXT: add r4, r4, r11 +; P8LE-NEXT: srwi r11, r10, 31 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: srawi r10, r10, 6 +; P8LE-NEXT: srwi r12, r9, 31 +; P8LE-NEXT: add r8, r8, r0 +; P8LE-NEXT: srawi r9, r9, 6 +; P8LE-NEXT: add r10, r10, r11 +; P8LE-NEXT: srwi r11, r4, 31 +; P8LE-NEXT: srawi r4, r4, 6 +; P8LE-NEXT: add r9, r9, r12 +; P8LE-NEXT: mulli r8, r8, 95 +; P8LE-NEXT: add r4, r4, r11 +; P8LE-NEXT: mulli r9, r9, 95 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r4, r4, 95 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: subf r6, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_srem_vec_2: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -21386 +; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: mulld r9, r6, r3 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r10, r7, r3 +; P8BE-NEXT: mulld r3, r4, r3 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: add r8, r8, r5 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: add r9, r9, r6 +; P8BE-NEXT: srwi r11, r8, 31 +; P8BE-NEXT: srawi r8, r8, 6 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: add r10, r10, r7 +; P8BE-NEXT: add r8, r8, r11 +; P8BE-NEXT: srwi r11, r9, 31 +; P8BE-NEXT: add r3, r3, r4 +; P8BE-NEXT: srawi r9, r9, 6 +; P8BE-NEXT: mulli r8, r8, 95 +; P8BE-NEXT: add r9, r9, r11 +; P8BE-NEXT: srwi r11, r10, 31 +; P8BE-NEXT: srawi r10, r10, 6 +; P8BE-NEXT: mulli r9, r9, 95 +; P8BE-NEXT: add r10, r10, r11 +; P8BE-NEXT: srwi r11, r3, 31 +; P8BE-NEXT: srawi r3, r3, 6 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: subf r7, r10, r7 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r4, r7, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; 
P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; P9LE-LABEL: combine_srem_sdiv: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r6, r4, 95 +; P9LE-NEXT: subf r3, r6, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r6, r3 +; P9LE-NEXT: extsw r6, r6 +; P9LE-NEXT: mulld r7, r6, r5 +; P9LE-NEXT: rldicl r7, r7, 32, 32 +; P9LE-NEXT: add r6, r7, r6 +; P9LE-NEXT: srwi r7, r6, 31 +; P9LE-NEXT: srawi r6, r6, 6 +; P9LE-NEXT: add r6, r6, r7 +; P9LE-NEXT: mulli r7, r6, 95 +; P9LE-NEXT: subf r3, r7, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r7, r3 +; P9LE-NEXT: extsw r7, r7 +; P9LE-NEXT: mulld r8, r7, r5 +; P9LE-NEXT: rldicl r8, r8, 32, 32 +; P9LE-NEXT: add r7, r8, r7 +; P9LE-NEXT: srwi r8, r7, 31 +; P9LE-NEXT: srawi r7, r7, 6 +; P9LE-NEXT: add r7, r7, r8 +; P9LE-NEXT: mulli r8, r7, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r8, r3 +; P9LE-NEXT: extsw r8, r8 +; P9LE-NEXT: mulld r5, r8, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r5, r5, r8 +; P9LE-NEXT: srwi r8, r5, 31 +; P9LE-NEXT: srawi r5, r5, 6 +; P9LE-NEXT: add r5, r5, r8 +; P9LE-NEXT: mulli r8, r5, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r6 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r7 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r5 +; P9LE-NEXT: xxswapd v5, vs0 +; P9LE-NEXT: vmrglh v4, v5, v4 +; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: vadduhm v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: combine_srem_sdiv: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r4, r3 +; P9BE-NEXT: lis r5, -21386 +; P9BE-NEXT: ori r5, r5, 37253 +; P9BE-NEXT: extsw r4, r4 +; P9BE-NEXT: mulld r6, r4, r5 +; P9BE-NEXT: rldicl r6, r6, 32, 32 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r6, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r6 +; P9BE-NEXT: mulli r6, r4, 95 +; P9BE-NEXT: subf r3, r6, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r6, r3 +; P9BE-NEXT: extsw r6, r6 +; P9BE-NEXT: mulld r7, r6, r5 +; P9BE-NEXT: rldicl r7, r7, 32, 32 +; P9BE-NEXT: add r6, r7, r6 +; P9BE-NEXT: srwi r7, r6, 31 +; P9BE-NEXT: srawi r6, r6, 6 +; P9BE-NEXT: add r6, r6, r7 +; P9BE-NEXT: mulli r7, r6, 95 +; P9BE-NEXT: subf r3, r7, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r7, r3 +; P9BE-NEXT: extsw r7, r7 +; P9BE-NEXT: mulld r8, r7, r5 +; P9BE-NEXT: 
rldicl r8, r8, 32, 32 +; P9BE-NEXT: add r7, r8, r7 +; P9BE-NEXT: srwi r8, r7, 31 +; P9BE-NEXT: srawi r7, r7, 6 +; P9BE-NEXT: add r7, r7, r8 +; P9BE-NEXT: mulli r8, r7, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r5 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r8, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r8 +; P9BE-NEXT: mulli r8, r5, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: sldi r3, r4, 48 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: sldi r3, r6, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r7, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r5, 48 +; P9BE-NEXT: mtvsrd v5, r3 +; P9BE-NEXT: vmrghh v4, v5, v4 +; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: vadduhm v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: combine_srem_sdiv: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r5, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r5, r5, 37253 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: clrldi r3, r6, 48 +; P8LE-NEXT: rldicl r4, r6, 48, 48 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: extsh r9, r4 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: extsh r10, r7 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsh r11, r6 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: mulld r12, r8, r5 +; P8LE-NEXT: extsw r11, r11 +; P8LE-NEXT: mulld r0, r9, r5 +; P8LE-NEXT: mulld r30, r10, r5 +; P8LE-NEXT: mulld r5, r11, r5 +; P8LE-NEXT: rldicl r12, r12, 32, 32 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: add r8, r12, r8 +; P8LE-NEXT: rldicl r5, r5, 32, 32 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: add r10, r30, r10 +; P8LE-NEXT: srwi r12, r8, 31 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: srawi r0, r9, 6 +; P8LE-NEXT: srwi r9, r9, 31 +; P8LE-NEXT: add r5, r5, r11 +; P8LE-NEXT: add r8, r8, r12 +; P8LE-NEXT: srawi r12, r10, 6 +; P8LE-NEXT: srwi r10, r10, 31 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: mulli r0, r8, 95 +; P8LE-NEXT: add r10, r12, r10 +; P8LE-NEXT: mtvsrd f0, r8 +; P8LE-NEXT: srwi r8, r5, 31 +; P8LE-NEXT: srawi r5, r5, 6 +; P8LE-NEXT: mulli r11, r9, 95 +; P8LE-NEXT: mtvsrd f1, r9 +; P8LE-NEXT: mulli r9, r10, 95 +; P8LE-NEXT: add r5, r5, r8 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r10 +; P8LE-NEXT: mtvsrd f3, r5 +; P8LE-NEXT: mulli r5, r5, 95 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r3, r0, r3 +; P8LE-NEXT: xxswapd v1, vs2 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r4, r11, r4 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: subf r3, r9, r7 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f4, r3 +; P8LE-NEXT: subf r3, r5, r6 +; P8LE-NEXT: mtvsrd f5, r3 +; P8LE-NEXT: xxswapd v4, vs1 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: xxswapd v0, vs5 +; P8LE-NEXT: vmrglh v3, v4, v3 +; P8LE-NEXT: vmrglh v4, v0, v5 +; P8LE-NEXT: vmrglh v5, v6, v1 +; P8LE-NEXT: vmrglw v3, v4, v3 +; P8LE-NEXT: vmrglw v2, v5, v2 +; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: combine_srem_sdiv: +; P8BE: 
# %bb.0: +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r5, -21386 +; P8BE-NEXT: ori r5, r5, 37253 +; P8BE-NEXT: clrldi r3, r6, 48 +; P8BE-NEXT: rldicl r4, r6, 48, 48 +; P8BE-NEXT: extsh r8, r3 +; P8BE-NEXT: rldicl r7, r6, 32, 48 +; P8BE-NEXT: extsh r9, r4 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: extsh r10, r7 +; P8BE-NEXT: extsw r9, r9 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: mulld r11, r8, r5 +; P8BE-NEXT: extsw r10, r10 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r12, r9, r5 +; P8BE-NEXT: mulld r0, r10, r5 +; P8BE-NEXT: mulld r5, r6, r5 +; P8BE-NEXT: rldicl r11, r11, 32, 32 +; P8BE-NEXT: rldicl r12, r12, 32, 32 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: rldicl r0, r0, 32, 32 +; P8BE-NEXT: rldicl r5, r5, 32, 32 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: srawi r11, r8, 6 +; P8BE-NEXT: srwi r8, r8, 31 +; P8BE-NEXT: add r10, r0, r10 +; P8BE-NEXT: add r5, r5, r6 +; P8BE-NEXT: srawi r12, r9, 6 +; P8BE-NEXT: srwi r9, r9, 31 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: srawi r0, r10, 6 +; P8BE-NEXT: srawi r11, r5, 6 +; P8BE-NEXT: srwi r10, r10, 31 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: srwi r5, r5, 31 +; P8BE-NEXT: mulli r12, r8, 95 +; P8BE-NEXT: add r10, r0, r10 +; P8BE-NEXT: add r5, r11, r5 +; P8BE-NEXT: mulli r0, r9, 95 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: mtvsrd v3, r9 +; P8BE-NEXT: mulli r9, r5, 95 +; P8BE-NEXT: mtvsrd v2, r8 +; P8BE-NEXT: mulli r8, r10, 95 +; P8BE-NEXT: sldi r10, r10, 48 +; P8BE-NEXT: subf r3, r12, r3 +; P8BE-NEXT: mtvsrd v4, r10 +; P8BE-NEXT: subf r4, r0, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: subf r3, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: mtvsrd v1, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: mtvsrd v0, r6 +; P8BE-NEXT: vmrghh v3, v5, v3 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v0, v1, v0 +; P8BE-NEXT: vmrghh v4, v5, v4 +; P8BE-NEXT: vmrghw v3, v0, v3 +; P8BE-NEXT: vmrghw v2, v4, v2 +; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_srem_power_of_two: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 6 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 5 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 5 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 3 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 3 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_power_of_two: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 5 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 5 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 6 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 6 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 3 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 3 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_power_of_two: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, -21386 +; P8LE-NEXT: ori r3, r3, 37253 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: clrldi r7, r4, 48 +; P8LE-NEXT: extsh r6, r5 +; P8LE-NEXT: extsh r8, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: rldicl r9, r4, 48, 48 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: extsh r10, r9 +; P8LE-NEXT: addze r8, r8 +; P8LE-NEXT: rldicl r4, r4, 32, 48 +; P8LE-NEXT: srawi r10, r10, 5 +; P8LE-NEXT: slwi r8, r8, 6 +; P8LE-NEXT: subf r7, r8, r7 +; P8LE-NEXT: rldicl r3, 
r3, 32, 32 +; P8LE-NEXT: mtvsrd f0, r7 +; P8LE-NEXT: add r3, r3, r6 +; P8LE-NEXT: addze r6, r10 +; P8LE-NEXT: srwi r10, r3, 31 +; P8LE-NEXT: srawi r3, r3, 6 +; P8LE-NEXT: slwi r6, r6, 5 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: add r3, r3, r10 +; P8LE-NEXT: extsh r10, r4 +; P8LE-NEXT: subf r6, r6, r9 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: srawi r8, r10, 3 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: addze r7, r8 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: slwi r5, r7, 3 +; P8LE-NEXT: subf r4, r5, r4 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_power_of_two: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -21386 +; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: srawi r8, r6, 5 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: srawi r9, r7, 6 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: slwi r8, r8, 5 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: subf r6, r8, r6 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: slwi r8, r9, 6 +; P8BE-NEXT: add r3, r3, r5 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: srwi r10, r3, 31 +; P8BE-NEXT: srawi r3, r3, 6 +; P8BE-NEXT: add r3, r3, r10 +; P8BE-NEXT: srawi r9, r4, 3 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: addze r8, r9 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: slwi r6, r8, 3 +; P8BE-NEXT: subf r4, r6, r4 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_srem_one: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -14230 +; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 9 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 17097 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 4 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 24749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 47143 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v3, v4 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_one: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 4 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 47143 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -14230 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 30865 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 9 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_one: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 24749 +; P8LE-NEXT: lis r8, -19946 +; P8LE-NEXT: lis r10, -14230 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: ori r3, r3, 47143 
+; P8LE-NEXT: ori r8, r8, 17097 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: rldicl r6, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: extsh r7, r5 +; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: extsh r11, r4 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: ori r7, r10, 30865 +; P8LE-NEXT: extsw r10, r11 +; P8LE-NEXT: mulld r8, r9, r8 +; P8LE-NEXT: mulld r7, r10, r7 +; P8LE-NEXT: rldicl r11, r3, 1, 63 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: rldicl r8, r8, 32, 32 +; P8LE-NEXT: rldicl r7, r7, 32, 32 +; P8LE-NEXT: add r8, r8, r9 +; P8LE-NEXT: srawi r3, r3, 11 +; P8LE-NEXT: add r7, r7, r10 +; P8LE-NEXT: srwi r9, r8, 31 +; P8LE-NEXT: srawi r8, r8, 4 +; P8LE-NEXT: add r3, r3, r11 +; P8LE-NEXT: add r8, r8, r9 +; P8LE-NEXT: srwi r9, r7, 31 +; P8LE-NEXT: srawi r7, r7, 9 +; P8LE-NEXT: mulli r3, r3, 5423 +; P8LE-NEXT: add r7, r7, r9 +; P8LE-NEXT: mulli r8, r8, 23 +; P8LE-NEXT: mulli r7, r7, 654 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r8, r6 +; P8LE-NEXT: subf r4, r7, r4 +; P8LE-NEXT: mtvsrd f1, r3 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_one: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: lis r7, -19946 +; P8BE-NEXT: lis r8, -14230 +; P8BE-NEXT: ori r3, r3, 47143 +; P8BE-NEXT: ori r7, r7, 17097 +; P8BE-NEXT: ori r8, r8, 30865 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: mulld r7, r6, r7 +; P8BE-NEXT: mulld r8, r4, r8 +; P8BE-NEXT: rldicl r9, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r7, r7, 32, 32 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: srawi r3, r3, 11 +; P8BE-NEXT: add r7, r7, r6 +; P8BE-NEXT: add r8, r8, r4 +; P8BE-NEXT: add r3, r3, r9 +; P8BE-NEXT: srwi r9, r7, 31 +; P8BE-NEXT: srawi r7, r7, 4 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r7, r7, r9 +; P8BE-NEXT: srwi r9, r8, 31 +; P8BE-NEXT: srawi r8, r8, 9 +; P8BE-NEXT: mulli r7, r7, 23 +; P8BE-NEXT: add r8, r8, r9 +; P8BE-NEXT: li r9, 0 +; P8BE-NEXT: mulli r8, r8, 654 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r9, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: subf r4, r8, r4 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_i16_smax: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: ori r5, r5, 17097 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 4 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 24749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 47143 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 15 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 15 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_i16_smax: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 4 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 47143 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 15 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 15 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_i16_smax: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r6, 24749 +; P8LE-NEXT: lis r7, -19946 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: ori r6, r6, 47143 +; P8LE-NEXT: ori r7, r7, 17097 +; P8LE-NEXT: mfvsrd r3, f0 +; P8LE-NEXT: rldicl r4, r3, 16, 48 +; P8LE-NEXT: rldicl r5, r3, 32, 48 +; P8LE-NEXT: extsh r8, r4 +; P8LE-NEXT: extsh r9, r5 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r6, r8, r6 +; P8LE-NEXT: mulld r7, r9, r7 +; P8LE-NEXT: rldicl r3, r3, 48, 48 +; P8LE-NEXT: rldicl r8, r6, 32, 32 +; P8LE-NEXT: rldicl r7, r7, 32, 32 +; P8LE-NEXT: rldicl r6, r6, 1, 
63 +; P8LE-NEXT: srawi r8, r8, 11 +; P8LE-NEXT: add r7, r7, r9 +; P8LE-NEXT: add r6, r8, r6 +; P8LE-NEXT: srwi r8, r7, 31 +; P8LE-NEXT: srawi r7, r7, 4 +; P8LE-NEXT: mulli r6, r6, 5423 +; P8LE-NEXT: add r7, r7, r8 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: mulli r7, r7, 23 +; P8LE-NEXT: srawi r8, r8, 15 +; P8LE-NEXT: subf r4, r6, r4 +; P8LE-NEXT: addze r6, r8 +; P8LE-NEXT: mtvsrd f0, r4 +; P8LE-NEXT: slwi r4, r6, 15 +; P8LE-NEXT: subf r5, r7, r5 +; P8LE-NEXT: subf r3, r4, r3 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_i16_smax: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: lis r7, -19946 +; P8BE-NEXT: ori r3, r3, 47143 +; P8BE-NEXT: ori r7, r7, 17097 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: mulld r7, r6, r7 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: rldicl r8, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r7, r7, 32, 32 +; P8BE-NEXT: srawi r3, r3, 11 +; P8BE-NEXT: add r7, r7, r6 +; P8BE-NEXT: add r3, r3, r8 +; P8BE-NEXT: srwi r8, r7, 31 +; P8BE-NEXT: srawi r7, r7, 4 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r7, r7, r8 +; P8BE-NEXT: li r8, 0 +; P8BE-NEXT: mulli r7, r7, 23 +; P8BE-NEXT: srawi r9, r4, 15 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r8, 48 +; P8BE-NEXT: addze r8, r9 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: slwi r6, r8, 15 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: subf r4, r6, r4 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_fold_srem_i64: +; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 24749 +; P9LE-NEXT: ori r4, r4, 47142 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 58853 +; P9LE-NEXT: mfvsrd r3, v3 +; P9LE-NEXT: ori r4, r4, 6055 +; P9LE-NEXT: mulhd r4, r3, r4 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: sradi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: ori r5, r5, 17096 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 22795 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: ori r5, r5, 8549 +; P9LE-NEXT: mulhd r5, r4, r5 +; P9LE-NEXT: add r5, r5, r4 +; P9LE-NEXT: rldicl r6, r5, 1, 63 +; P9LE-NEXT: sradi r5, r5, 4 +; P9LE-NEXT: add r5, r5, r6 +; P9LE-NEXT: mulli r5, r5, 23 +; P9LE-NEXT: sub r4, r4, r5 +; P9LE-NEXT: mtvsrdd v3, r3, r4 +; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: ori r4, r4, 15432 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 1603 +; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: ori r4, r4, 21445 +; P9LE-NEXT: mulhd r4, r3, r4 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: sradi r4, r4, 8 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: li r4, 0 +; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_i64: +; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: ori r4, r4, 47142 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 58853 +; P9BE-NEXT: mfvsrld r3, v3 +; P9BE-NEXT: ori r4, r4, 6055 +; P9BE-NEXT: mulhd r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: sradi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: lis r5, -19946 +; P9BE-NEXT: ori r5, r5, 17096 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 22795 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mfvsrd r4, v3 +; P9BE-NEXT: ori r5, r5, 8549 +; P9BE-NEXT: mulhd r5, r4, r5 +; P9BE-NEXT: add r5, r5, r4 +; P9BE-NEXT: rldicl r6, r5, 1, 63 +; P9BE-NEXT: sradi r5, r5, 4 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 23 +; P9BE-NEXT: sub r4, r4, r5 +; P9BE-NEXT: mtvsrdd v3, r4, r3 +; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: ori r4, r4, 15432 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 1603 +; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: ori r4, r4, 21445 +; P9BE-NEXT: mulhd r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: sradi r4, r4, 8 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_i64: +; P8LE: # %bb.0: +; P8LE-NEXT: lis r3, 24749 +; P8LE-NEXT: lis r4, -19946 +; P8LE-NEXT: lis r5, 25653 +; P8LE-NEXT: xxswapd vs0, v3 +; P8LE-NEXT: mfvsrd r6, v3 +; P8LE-NEXT: ori r3, r3, 47142 +; P8LE-NEXT: ori r4, r4, 17096 +; P8LE-NEXT: ori r5, r5, 15432 +; P8LE-NEXT: mfvsrd r7, v2 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r3, r3, 58853 +; P8LE-NEXT: oris r4, r4, 22795 +; P8LE-NEXT: mfvsrd r8, f0 +; P8LE-NEXT: oris r5, r5, 1603 +; P8LE-NEXT: ori r3, r3, 6055 +; P8LE-NEXT: ori r4, r4, 8549 +; P8LE-NEXT: ori r5, r5, 21445 +; P8LE-NEXT: mulhd r3, r6, r3 +; P8LE-NEXT: mulhd r5, r7, r5 +; P8LE-NEXT: mulhd r4, r8, r4 +; P8LE-NEXT: rldicl r9, r3, 1, 63 +; P8LE-NEXT: sradi r3, r3, 11 +; P8LE-NEXT: add r3, r3, r9 +; P8LE-NEXT: rldicl r9, r5, 1, 63 +; P8LE-NEXT: add r4, r4, r8 +; P8LE-NEXT: sradi r5, r5, 8 +; P8LE-NEXT: 
mulli r3, r3, 5423 +; P8LE-NEXT: add r5, r5, r9 +; P8LE-NEXT: rldicl r9, r4, 1, 63 +; P8LE-NEXT: sradi r4, r4, 4 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: add r4, r4, r9 +; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: sub r3, r6, r3 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: sub r5, r7, r5 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: sub r3, r8, r4 +; P8LE-NEXT: li r4, 0 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxmrghd v3, vs0, vs2 +; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_i64: +; P8BE: # %bb.0: +; P8BE-NEXT: lis r4, -19946 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: xxswapd vs1, v2 +; P8BE-NEXT: ori r4, r4, 17096 +; P8BE-NEXT: ori r3, r3, 47142 +; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: mfvsrd r6, v3 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: oris r4, r4, 22795 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: oris r3, r3, 58853 +; P8BE-NEXT: mfvsrd r7, f0 +; P8BE-NEXT: ori r4, r4, 8549 +; P8BE-NEXT: ori r3, r3, 6055 +; P8BE-NEXT: oris r5, r5, 1603 +; P8BE-NEXT: mfvsrd r8, f1 +; P8BE-NEXT: mulhd r4, r6, r4 +; P8BE-NEXT: mulhd r3, r7, r3 +; P8BE-NEXT: ori r5, r5, 21445 +; P8BE-NEXT: mulhd r5, r8, r5 +; P8BE-NEXT: add r4, r4, r6 +; P8BE-NEXT: rldicl r9, r3, 1, 63 +; P8BE-NEXT: sradi r3, r3, 11 +; P8BE-NEXT: rldicl r10, r4, 1, 63 +; P8BE-NEXT: sradi r4, r4, 4 +; P8BE-NEXT: add r3, r3, r9 +; P8BE-NEXT: rldicl r9, r5, 1, 63 +; P8BE-NEXT: add r4, r4, r10 +; P8BE-NEXT: sradi r5, r5, 8 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r5, r5, r9 +; P8BE-NEXT: mulli r4, r4, 23 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: sub r3, r7, r3 +; P8BE-NEXT: sub r4, r6, r4 +; P8BE-NEXT: mtvsrd f0, r3 +; P8BE-NEXT: sub r3, r8, r5 +; P8BE-NEXT: mtvsrd f1, r4 +; P8BE-NEXT: li r4, 0 +; P8BE-NEXT: mtvsrd f2, r3 +; P8BE-NEXT: mtvsrd f3, r4 +; P8BE-NEXT: xxmrghd v3, vs1, vs0 +; P8BE-NEXT: xxmrghd v2, vs3, vs2 +; P8BE-NEXT: blr + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/PowerPC/urem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/urem-lkk.ll +++ llvm/trunk/test/CodeGen/PowerPC/urem-lkk.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 22765 +; CHECK-NEXT: ori 4, 4, 8969 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: subf 5, 4, 3 +; CHECK-NEXT: srwi 5, 5, 1 +; CHECK-NEXT: add 4, 5, 4 +; CHECK-NEXT: srwi 4, 4, 6 +; CHECK-NEXT: mulli 4, 4, 95 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -2226 +; CHECK-NEXT: ori 4, 4, 16323 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: srwi 4, 4, 10 +; CHECK-NEXT: mulli 4, 4, 1060 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
+define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 22765 +; CHECK-NEXT: ori 4, 4, 8969 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: subf 5, 4, 3 +; CHECK-NEXT: srwi 5, 5, 1 +; CHECK-NEXT: add 4, 5, 4 +; CHECK-NEXT: srwi 4, 4, 6 +; CHECK-NEXT: mulli 5, 4, 95 +; CHECK-NEXT: subf 3, 5, 3 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: clrlwi 3, 3, 26 +; CHECK-NEXT: blr + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: blr + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 0, 4(1) +; CHECK-NEXT: stwu 1, -16(1) +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset lr, 4 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 6, 98 +; CHECK-NEXT: bl __umoddi3@PLT +; CHECK-NEXT: lwz 0, 20(1) +; CHECK-NEXT: addi 1, 1, 16 +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/PowerPC/urem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -0,0 +1,1338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: fold_urem_vec_1: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: lis r5, 21399 +; P9LE-NEXT: ori r5, r5, 33437 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: lis r5, 16727 +; P9LE-NEXT: ori r5, r5, 2287 +; P9LE-NEXT: rldicl r4, r4, 27, 37 +; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: lis r5, 8456 +; P9LE-NEXT: ori r5, r5, 16913 +; P9LE-NEXT: rldicl r4, r4, 24, 40 +; P9LE-NEXT: mulli r4, r4, 1003 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; 
P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 30, 34 +; P9LE-NEXT: mulli r4, r4, 124 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_urem_vec_1: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 16727 +; P9BE-NEXT: ori r5, r5, 2287 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: lis r5, 21399 +; P9BE-NEXT: ori r5, r5, 33437 +; P9BE-NEXT: rldicl r4, r4, 24, 40 +; P9BE-NEXT: mulli r4, r4, 1003 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: lis r5, 8456 +; P9BE-NEXT: ori r5, r5, 16913 +; P9BE-NEXT: rldicl r4, r4, 27, 37 +; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: clrlwi r4, r3, 16 +; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rldicl r3, r3, 30, 34 +; P9BE-NEXT: mulli r3, r3, 124 +; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_urem_vec_1: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 22765 +; P8LE-NEXT: lis r8, 21399 +; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: ori r8, r8, 33437 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r9, r4, 32, 48 +; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 +; P8LE-NEXT: rldicl r10, r4, 16, 48 +; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31 +; P8LE-NEXT: clrldi r7, r6, 32 +; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: lis r7, 16727 +; P8LE-NEXT: ori r7, r7, 2287 +; P8LE-NEXT: mulld r8, r11, r8 +; P8LE-NEXT: lis r11, 8456 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: mulld r7, r12, r7 +; P8LE-NEXT: ori r11, r11, 16913 +; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: mulld r11, r12, r11 +; 
P8LE-NEXT: subf r6, r3, r6 +; P8LE-NEXT: rldicl r8, r8, 27, 37 +; P8LE-NEXT: srwi r6, r6, 1 +; P8LE-NEXT: add r3, r6, r3 +; P8LE-NEXT: rldicl r6, r7, 24, 40 +; P8LE-NEXT: mulli r7, r8, 98 +; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rldicl r8, r11, 30, 34 +; P8LE-NEXT: mulli r6, r6, 1003 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: mulli r8, r8, 124 +; P8LE-NEXT: subf r7, r7, r9 +; P8LE-NEXT: subf r6, r6, r10 +; P8LE-NEXT: mtvsrd f0, r7 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: subf r4, r8, r4 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_urem_vec_1: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; P8BE-NEXT: lis r9, 16727 +; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r9, r9, 2287 +; P8BE-NEXT: rldicl r5, r4, 16, 48 +; P8BE-NEXT: clrldi r6, r4, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: lis r8, 21399 +; P8BE-NEXT: clrldi r10, r6, 32 +; P8BE-NEXT: ori r8, r8, 33437 +; P8BE-NEXT: clrldi r11, r7, 32 +; P8BE-NEXT: mulld r9, r10, r9 +; P8BE-NEXT: lis r10, 8456 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: mulld r8, r11, r8 +; P8BE-NEXT: ori r10, r10, 16913 +; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: mulld r10, r11, r10 +; P8BE-NEXT: subf r11, r3, r5 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: rldicl r9, r9, 24, 40 +; P8BE-NEXT: add r3, r11, r3 +; P8BE-NEXT: rldicl r8, r8, 27, 37 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: mulli r9, r9, 1003 +; P8BE-NEXT: rldicl r10, r10, 30, 34 +; P8BE-NEXT: mulli r8, r8, 98 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: mulli r10, r10, 124 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: subf r4, r10, r4 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v4, v5 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: fold_urem_vec_2: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; 
P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_urem_vec_2: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_urem_vec_2: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, 22765 +; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 8969 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: clrldi r3, r5, 48 +; P8LE-NEXT: rldicl r6, r5, 48, 48 +; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8LE-NEXT: rldicl r7, r5, 32, 48 +; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r5, r5, 16, 48 +; P8LE-NEXT: clrldi r11, r8, 32 +; P8LE-NEXT: 
rlwinm r10, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r12, r5, 0, 16, 31 +; P8LE-NEXT: mulld r11, r11, r4 +; P8LE-NEXT: clrldi r0, r9, 32 +; P8LE-NEXT: clrldi r30, r10, 32 +; P8LE-NEXT: clrldi r29, r12, 32 +; P8LE-NEXT: mulld r0, r0, r4 +; P8LE-NEXT: mulld r30, r30, r4 +; P8LE-NEXT: mulld r4, r29, r4 +; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: subf r8, r11, r8 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: srwi r8, r8, 1 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: subf r9, r0, r9 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: subf r10, r30, r10 +; P8LE-NEXT: subf r11, r4, r12 +; P8LE-NEXT: srwi r9, r9, 1 +; P8LE-NEXT: srwi r8, r8, 6 +; P8LE-NEXT: srwi r10, r10, 1 +; P8LE-NEXT: srwi r11, r11, 1 +; P8LE-NEXT: add r9, r9, r0 +; P8LE-NEXT: add r10, r10, r30 +; P8LE-NEXT: add r4, r11, r4 +; P8LE-NEXT: srwi r9, r9, 6 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: mulli r8, r8, 95 +; P8LE-NEXT: srwi r10, r10, 6 +; P8LE-NEXT: srwi r4, r4, 6 +; P8LE-NEXT: mulli r9, r9, 95 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r4, r4, 95 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: subf r6, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_urem_vec_2: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: clrldi r9, r6, 32 +; P8BE-NEXT: mulld r8, r8, r3 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: clrldi r10, r7, 32 +; P8BE-NEXT: mulld r9, r9, r3 +; P8BE-NEXT: clrldi r11, r4, 32 +; P8BE-NEXT: mulld r10, r10, r3 +; P8BE-NEXT: mulld r3, r11, r3 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: subf r11, r8, r5 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: subf r12, r9, r6 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: subf r11, r10, r7 +; P8BE-NEXT: srwi r12, r12, 1 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: subf r12, r3, r4 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: srwi r8, r8, 6 +; P8BE-NEXT: add r10, r11, r10 +; P8BE-NEXT: srwi r11, r12, 1 +; P8BE-NEXT: srwi r9, r9, 6 +; P8BE-NEXT: add r3, r11, r3 +; P8BE-NEXT: srwi r10, r10, 6 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: mulli r8, r8, 95 +; P8BE-NEXT: mulli r9, r9, 95 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: subf r7, r10, r7 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: sldi r4, r7, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = 
urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. +define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; P9LE-LABEL: combine_urem_udiv: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r5, r4, 95 +; P9LE-NEXT: subf r3, r5, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r5, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r7, r5, 32 +; P9LE-NEXT: mulld r7, r7, r6 +; P9LE-NEXT: rldicl r7, r7, 32, 32 +; P9LE-NEXT: subf r5, r7, r5 +; P9LE-NEXT: srwi r5, r5, 1 +; P9LE-NEXT: add r5, r5, r7 +; P9LE-NEXT: srwi r5, r5, 6 +; P9LE-NEXT: mulli r7, r5, 95 +; P9LE-NEXT: subf r3, r7, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r7, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r8, r7, 32 +; P9LE-NEXT: mulld r8, r8, r6 +; P9LE-NEXT: rldicl r8, r8, 32, 32 +; P9LE-NEXT: subf r7, r8, r7 +; P9LE-NEXT: srwi r7, r7, 1 +; P9LE-NEXT: add r7, r7, r8 +; P9LE-NEXT: srwi r7, r7, 6 +; P9LE-NEXT: mulli r8, r7, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r9, r8, 32 +; P9LE-NEXT: mulld r6, r9, r6 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: subf r8, r6, r8 +; P9LE-NEXT: srwi r8, r8, 1 +; P9LE-NEXT: add r6, r8, r6 +; P9LE-NEXT: srwi r6, r6, 6 +; P9LE-NEXT: mulli r8, r6, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r5 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r7 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r6 +; P9LE-NEXT: xxswapd v5, vs0 +; P9LE-NEXT: vmrglh v4, v5, v4 +; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: vadduhm v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: combine_urem_udiv: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9BE-NEXT: lis r6, 22765 +; P9BE-NEXT: ori r6, r6, 8969 +; P9BE-NEXT: clrldi r5, r4, 32 +; P9BE-NEXT: mulld r5, r5, r6 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: subf r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 1 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r5, r4, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r5, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r7, r5, 32 +; P9BE-NEXT: mulld r7, r7, r6 +; P9BE-NEXT: rldicl r7, r7, 32, 32 +; P9BE-NEXT: subf r5, r7, r5 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r5, r5, r7 +; P9BE-NEXT: srwi r5, r5, 6 +; P9BE-NEXT: mulli r7, r5, 95 +; P9BE-NEXT: subf r3, r7, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r7, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r8, 
r7, 32 +; P9BE-NEXT: mulld r8, r8, r6 +; P9BE-NEXT: rldicl r8, r8, 32, 32 +; P9BE-NEXT: subf r7, r8, r7 +; P9BE-NEXT: srwi r7, r7, 1 +; P9BE-NEXT: add r7, r7, r8 +; P9BE-NEXT: srwi r7, r7, 6 +; P9BE-NEXT: mulli r8, r7, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r8, r3, 32 +; P9BE-NEXT: mulld r6, r8, r6 +; P9BE-NEXT: rldicl r6, r6, 32, 32 +; P9BE-NEXT: subf r8, r6, r3 +; P9BE-NEXT: srwi r8, r8, 1 +; P9BE-NEXT: add r6, r8, r6 +; P9BE-NEXT: srwi r6, r6, 6 +; P9BE-NEXT: mulli r8, r6, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: sldi r3, r4, 48 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: sldi r3, r5, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r7, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r6, 48 +; P9BE-NEXT: mtvsrd v5, r3 +; P9BE-NEXT: vmrghh v4, v5, v4 +; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: vadduhm v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: combine_urem_udiv: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r5, 22765 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r5, r5, 8969 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: clrldi r3, r6, 48 +; P8LE-NEXT: rldicl r4, r6, 48, 48 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8LE-NEXT: rlwinm r9, r4, 0, 16, 31 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: rlwinm r10, r7, 0, 16, 31 +; P8LE-NEXT: clrldi r11, r8, 32 +; P8LE-NEXT: rlwinm r12, r6, 0, 16, 31 +; P8LE-NEXT: clrldi r0, r9, 32 +; P8LE-NEXT: clrldi r30, r10, 32 +; P8LE-NEXT: mulld r11, r11, r5 +; P8LE-NEXT: clrldi r29, r12, 32 +; P8LE-NEXT: mulld r0, r0, r5 +; P8LE-NEXT: mulld r30, r30, r5 +; P8LE-NEXT: mulld r5, r29, r5 +; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: subf r8, r11, r8 +; P8LE-NEXT: rldicl r5, r5, 32, 32 +; P8LE-NEXT: subf r9, r0, r9 +; P8LE-NEXT: srwi r8, r8, 1 +; P8LE-NEXT: subf r10, r30, r10 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: srwi r9, r9, 1 +; P8LE-NEXT: srwi r10, r10, 1 +; P8LE-NEXT: subf r11, r5, r12 +; P8LE-NEXT: add r9, r9, r0 +; P8LE-NEXT: srwi r8, r8, 6 +; P8LE-NEXT: add r10, r10, r30 +; P8LE-NEXT: srwi r11, r11, 1 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: srwi r9, r9, 6 +; P8LE-NEXT: mulli r12, r8, 95 +; P8LE-NEXT: srwi r10, r10, 6 +; P8LE-NEXT: add r5, r11, r5 +; P8LE-NEXT: mtvsrd f0, r8 +; P8LE-NEXT: mulli r8, r9, 95 +; P8LE-NEXT: mtvsrd f1, r9 +; P8LE-NEXT: mulli r9, r10, 95 +; P8LE-NEXT: srwi r5, r5, 6 +; P8LE-NEXT: mtvsrd f3, r5 +; P8LE-NEXT: mulli r5, r5, 95 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: mtvsrd f2, r10 +; P8LE-NEXT: subf r3, r12, r3 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r9, r7 +; P8LE-NEXT: subf r4, r8, r4 +; P8LE-NEXT: xxswapd v1, vs2 +; P8LE-NEXT: mtvsrd f4, r3 +; P8LE-NEXT: subf r3, r5, r6 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f5, r3 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: xxswapd v4, vs1 +; P8LE-NEXT: xxswapd v0, vs5 +; P8LE-NEXT: 
vmrglh v3, v4, v3 +; P8LE-NEXT: vmrglh v4, v0, v5 +; P8LE-NEXT: vmrglh v5, v6, v1 +; P8LE-NEXT: vmrglw v3, v4, v3 +; P8LE-NEXT: vmrglw v2, v5, v2 +; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: combine_urem_udiv: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r5, 22765 +; P8BE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8BE-NEXT: ori r5, r5, 8969 +; P8BE-NEXT: clrldi r3, r6, 48 +; P8BE-NEXT: rldicl r4, r6, 48, 48 +; P8BE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r6, 32, 48 +; P8BE-NEXT: rlwinm r9, r4, 0, 16, 31 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: clrldi r11, r8, 32 +; P8BE-NEXT: rlwinm r10, r7, 0, 16, 31 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r12, r9, 32 +; P8BE-NEXT: mulld r11, r11, r5 +; P8BE-NEXT: clrldi r0, r10, 32 +; P8BE-NEXT: clrldi r30, r6, 32 +; P8BE-NEXT: mulld r12, r12, r5 +; P8BE-NEXT: mulld r0, r0, r5 +; P8BE-NEXT: mulld r5, r30, r5 +; P8BE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8BE-NEXT: rldicl r11, r11, 32, 32 +; P8BE-NEXT: rldicl r12, r12, 32, 32 +; P8BE-NEXT: subf r8, r11, r8 +; P8BE-NEXT: rldicl r5, r5, 32, 32 +; P8BE-NEXT: subf r9, r12, r9 +; P8BE-NEXT: srwi r8, r8, 1 +; P8BE-NEXT: rldicl r0, r0, 32, 32 +; P8BE-NEXT: add r8, r8, r11 +; P8BE-NEXT: srwi r9, r9, 1 +; P8BE-NEXT: subf r11, r5, r6 +; P8BE-NEXT: subf r10, r0, r10 +; P8BE-NEXT: add r9, r9, r12 +; P8BE-NEXT: srwi r8, r8, 6 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: srwi r10, r10, 1 +; P8BE-NEXT: srwi r9, r9, 6 +; P8BE-NEXT: add r5, r11, r5 +; P8BE-NEXT: mulli r12, r8, 95 +; P8BE-NEXT: add r10, r10, r0 +; P8BE-NEXT: srwi r5, r5, 6 +; P8BE-NEXT: mulli r11, r9, 95 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: srwi r10, r10, 6 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: mtvsrd v3, r9 +; P8BE-NEXT: mulli r9, r5, 95 +; P8BE-NEXT: mtvsrd v2, r8 +; P8BE-NEXT: mulli r8, r10, 95 +; P8BE-NEXT: sldi r10, r10, 48 +; P8BE-NEXT: subf r3, r12, r3 +; P8BE-NEXT: mtvsrd v4, r10 +; P8BE-NEXT: subf r4, r11, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: subf r3, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: mtvsrd v1, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: mtvsrd v0, r6 +; P8BE-NEXT: vmrghh v3, v5, v3 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v0, v1, v0 +; P8BE-NEXT: vmrghh v4, v5, v4 +; P8BE-NEXT: vmrghw v3, v0, v3 +; P8BE-NEXT: vmrghw v2, v4, v2 +; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_power_of_two: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 26, 31 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_power_of_two: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 27, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 26, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_power_of_two: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 22765 +; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 +; P8LE-NEXT: clrldi r7, r6, 32 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: rldicl r7, r4, 48, 48 +; P8LE-NEXT: rlwinm r7, r7, 0, 27, 31 +; P8LE-NEXT: mtvsrd f1, r7 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r6, r3, r6 +; P8LE-NEXT: srwi r6, r6, 1 +; P8LE-NEXT: add r3, r6, r3 +; P8LE-NEXT: clrldi r6, r4, 48 +; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rldicl r4, r4, 32, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 26, 31 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: rlwinm r4, r4, 0, 29, 31 +; P8LE-NEXT: mtvsrd f0, r6 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_power_of_two: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; 
P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rlwinm r7, r7, 0, 26, 31 +; P8BE-NEXT: clrldi r6, r5, 32 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: subf r6, r3, r5 +; P8BE-NEXT: srwi r6, r6, 1 +; P8BE-NEXT: add r3, r6, r3 +; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 27, 31 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: rlwinm r4, r4, 0, 29, 31 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_one: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r5, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: oris r6, r5, 45590 +; P9LE-NEXT: oris r5, r5, 51306 +; P9LE-NEXT: ori r6, r6, 17097 +; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r6 +; P9LE-NEXT: lis r6, 24749 +; P9LE-NEXT: ori r6, r6, 47143 +; P9LE-NEXT: rldicl r4, r4, 28, 36 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r6 +; P9LE-NEXT: rldicl r4, r4, 21, 43 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 24, 40 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_one: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 24749 +; P9BE-NEXT: ori r5, r5, 47143 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: li r5, 0 +; P9BE-NEXT: oris r6, r5, 45590 +; P9BE-NEXT: oris r5, r5, 51306 +; P9BE-NEXT: ori r6, r6, 17097 +; P9BE-NEXT: ori r5, r5, 30865 +; P9BE-NEXT: rldicl r4, r4, 21, 43 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r6 +; P9BE-NEXT: rldicl r4, r4, 28, 36 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: clrlwi r4, r3, 16 +; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: rldicl r3, r3, 24, 40 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: 
mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_one: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: li r3, 0 +; P8LE-NEXT: lis r8, 24749 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: oris r5, r3, 45590 +; P8LE-NEXT: ori r8, r8, 47143 +; P8LE-NEXT: oris r3, r3, 51306 +; P8LE-NEXT: ori r5, r5, 17097 +; P8LE-NEXT: ori r3, r3, 30865 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r6, r4, 32, 48 +; P8LE-NEXT: rldicl r7, r4, 16, 48 +; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: mulld r5, r9, r5 +; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 +; P8LE-NEXT: mulld r8, r9, r8 +; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 +; P8LE-NEXT: mulld r3, r9, r3 +; P8LE-NEXT: rldicl r5, r5, 28, 36 +; P8LE-NEXT: rldicl r8, r8, 21, 43 +; P8LE-NEXT: mulli r5, r5, 23 +; P8LE-NEXT: rldicl r3, r3, 24, 40 +; P8LE-NEXT: mulli r8, r8, 5423 +; P8LE-NEXT: mulli r3, r3, 654 +; P8LE-NEXT: subf r5, r5, r6 +; P8LE-NEXT: subf r6, r8, r7 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: subf r3, r3, r4 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_one: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: li r3, 0 +; P8BE-NEXT: lis r8, 24749 +; P8BE-NEXT: oris r6, r3, 51306 +; P8BE-NEXT: ori r8, r8, 47143 +; P8BE-NEXT: oris r3, r3, 45590 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: clrldi r7, r4, 48 +; P8BE-NEXT: ori r6, r6, 30865 +; P8BE-NEXT: ori r3, r3, 17097 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: mulld r6, r9, r6 +; P8BE-NEXT: clrldi r9, r7, 32 +; P8BE-NEXT: mulld r8, r9, r8 +; P8BE-NEXT: clrldi r9, r4, 32 +; P8BE-NEXT: mulld r3, r9, r3 +; P8BE-NEXT: li r9, 0 +; P8BE-NEXT: rldicl r6, r6, 24, 40 +; P8BE-NEXT: mulli r6, r6, 654 +; P8BE-NEXT: rldicl r8, r8, 21, 43 +; P8BE-NEXT: rldicl r3, r3, 28, 36 +; P8BE-NEXT: mulli r8, r8, 5423 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: subf r5, r6, r5 +; P8BE-NEXT: sldi r6, r9, 48 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: subf r6, r8, r7 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r4, r6, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_fold_urem_i64: +; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 25644 +; P9LE-NEXT: ori r4, r4, 34192 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 45590 +; P9LE-NEXT: mfvsrld r3, v3 +; P9LE-NEXT: ori r4, r4, 17097 +; P9LE-NEXT: mulhdu r4, r3, r4 +; P9LE-NEXT: sub r5, r3, r4 +; P9LE-NEXT: rldicl r5, r5, 63, 1 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: lis r5, -16037 +; P9LE-NEXT: rldicl r4, r4, 60, 4 +; P9LE-NEXT: ori r5, r5, 28749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 52170 +; P9LE-NEXT: ori r5, r5, 12109 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: mfvsrd r4, v3 +; P9LE-NEXT: mulhdu r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 52, 12 +; P9LE-NEXT: mulli r5, r5, 5423 +; P9LE-NEXT: sub r4, r4, r5 +; P9LE-NEXT: lis r5, 25653 +; P9LE-NEXT: ori r5, r5, 15432 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: mtvsrdd v3, r4, r3 +; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: rldicl r4, r3, 63, 1 +; P9LE-NEXT: oris r5, r5, 1603 +; P9LE-NEXT: ori r5, r5, 21445 +; P9LE-NEXT: mulhdu r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 57, 7 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: li r4, 0 +; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_i64: +; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 25644 +; P9BE-NEXT: ori r4, r4, 34192 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 45590 +; P9BE-NEXT: mfvsrd r3, v3 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: mulhdu r4, r3, r4 +; P9BE-NEXT: sub r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 63, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: lis r5, -16037 +; P9BE-NEXT: rldicl r4, r4, 60, 4 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 52170 +; P9BE-NEXT: ori r5, r5, 12109 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: mulhdu r5, r4, r5 +; P9BE-NEXT: rldicl r5, r5, 52, 12 +; P9BE-NEXT: mulli r5, r5, 5423 +; P9BE-NEXT: sub r4, r4, r5 +; P9BE-NEXT: lis r5, 25653 +; P9BE-NEXT: ori r5, r5, 15432 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: mtvsrdd v3, r3, r4 +; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: rldicl r4, r3, 63, 1 +; P9BE-NEXT: oris r5, r5, 1603 +; P9BE-NEXT: ori r5, r5, 21445 +; P9BE-NEXT: mulhdu r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 57, 7 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_i64: +; P8LE: # %bb.0: +; P8LE-NEXT: lis r3, 25644 +; P8LE-NEXT: xxswapd vs0, v3 +; P8LE-NEXT: lis r4, -16037 +; P8LE-NEXT: lis r5, 25653 +; P8LE-NEXT: mfvsrd r6, v2 +; P8LE-NEXT: ori r3, r3, 34192 +; P8LE-NEXT: ori r4, r4, 28749 +; P8LE-NEXT: ori r5, r5, 15432 +; P8LE-NEXT: mfvsrd r8, v3 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: oris r3, r3, 45590 +; P8LE-NEXT: mfvsrd r7, f0 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 52170 +; P8LE-NEXT: ori r3, r3, 17097 +; P8LE-NEXT: oris r5, r5, 1603 +; P8LE-NEXT: ori r4, r4, 12109 +; P8LE-NEXT: mulhdu r3, r7, r3 +; P8LE-NEXT: rldicl r9, r6, 63, 1 +; P8LE-NEXT: ori r5, r5, 21445 +; P8LE-NEXT: mulhdu r4, r8, r4 +; P8LE-NEXT: mulhdu r5, r9, r5 +; P8LE-NEXT: sub r9, r7, r3 +; P8LE-NEXT: rldicl r9, r9, 63, 1 +; P8LE-NEXT: rldicl r4, r4, 52, 12 +; P8LE-NEXT: add r3, r9, r3 +; P8LE-NEXT: rldicl r5, r5, 57, 7 +; P8LE-NEXT: mulli r4, r4, 5423 +; P8LE-NEXT: rldicl r3, r3, 60, 4 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: mulli r3, r3, 23 +; P8LE-NEXT: 
sub r4, r8, r4 +; P8LE-NEXT: sub r5, r6, r5 +; P8LE-NEXT: mtvsrd f0, r4 +; P8LE-NEXT: sub r3, r7, r3 +; P8LE-NEXT: li r4, 0 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxmrghd v3, vs0, vs2 +; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_i64: +; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 25644 +; P8BE-NEXT: lis r4, -16037 +; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: xxswapd vs1, v2 +; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: ori r3, r3, 34192 +; P8BE-NEXT: ori r4, r4, 28749 +; P8BE-NEXT: mfvsrd r6, v3 +; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: oris r3, r3, 45590 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mfvsrd r7, f0 +; P8BE-NEXT: oris r4, r4, 52170 +; P8BE-NEXT: ori r3, r3, 17097 +; P8BE-NEXT: mfvsrd r8, f1 +; P8BE-NEXT: oris r5, r5, 1603 +; P8BE-NEXT: ori r4, r4, 12109 +; P8BE-NEXT: mulhdu r3, r6, r3 +; P8BE-NEXT: ori r5, r5, 21445 +; P8BE-NEXT: mulhdu r4, r7, r4 +; P8BE-NEXT: rldicl r9, r8, 63, 1 +; P8BE-NEXT: mulhdu r5, r9, r5 +; P8BE-NEXT: sub r9, r6, r3 +; P8BE-NEXT: rldicl r9, r9, 63, 1 +; P8BE-NEXT: rldicl r4, r4, 52, 12 +; P8BE-NEXT: add r3, r9, r3 +; P8BE-NEXT: mulli r4, r4, 5423 +; P8BE-NEXT: rldicl r5, r5, 57, 7 +; P8BE-NEXT: rldicl r3, r3, 60, 4 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: sub r4, r7, r4 +; P8BE-NEXT: mtvsrd f0, r4 +; P8BE-NEXT: sub r4, r8, r5 +; P8BE-NEXT: sub r3, r6, r3 +; P8BE-NEXT: mtvsrd f1, r4 +; P8BE-NEXT: li r4, 0 +; P8BE-NEXT: mtvsrd f2, r3 +; P8BE-NEXT: mtvsrd f3, r4 +; P8BE-NEXT: xxmrghd v3, vs2, vs0 +; P8BE-NEXT: xxmrghd v2, vs3, vs1 +; P8BE-NEXT: blr + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/RISCV/srem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/RISCV/srem-lkk.ll +++ llvm/trunk/test/CodeGen/RISCV/srem-lkk.ll @@ -0,0 +1,583 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; RV32I-LABEL: fold_srem_positive_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_positive_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 706409 +; RV32IM-NEXT: addi a1, a1, 389 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: add a1, a1, a0 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_positive_odd: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 
8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_positive_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 1045903 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -905 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -1767 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: add a1, a1, a0 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; RV32I-LABEL: fold_srem_positive_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 1060 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_positive_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 253241 +; RV32IM-NEXT: addi a1, a1, -15 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 1060 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_positive_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, 1060 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_positive_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 506482 +; RV64IM-NEXT: addiw a1, a1, -31 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 711 +; RV64IM-NEXT: slli a1, a1, 19 +; RV64IM-NEXT: addi a1, a1, 1979 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 1060 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; RV32I-LABEL: fold_srem_negative_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, -723 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_negative_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 677296 +; RV32IM-NEXT: addi a1, a1, -91 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; 
RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, -723 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_negative_odd: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, -723 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_negative_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 4781 +; RV64IM-NEXT: addiw a1, a1, 2045 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1371 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -11 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -1355 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sub a1, a1, a0 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, -723 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; RV32I-LABEL: fold_srem_negative_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: lui a1, 1048570 +; RV32I-NEXT: addi a1, a1, 1595 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_negative_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 1036895 +; RV32IM-NEXT: addi a1, a1, 999 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: lui a2, 1048570 +; RV32IM-NEXT: addi a2, a2, 1595 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_negative_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 1048570 +; RV64I-NEXT: addiw a1, a1, 1595 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_negative_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 1036895 +; RV64IM-NEXT: addiw a1, a1, 999 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 11 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -523 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -481 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 12 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: lui a2, 1048570 +; RV64IM-NEXT: addiw a2, a2, 1595 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. 
+define i32 @combine_srem_sdiv(i32 %x) { +; RV32I-LABEL: combine_srem_sdiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_srem_sdiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 706409 +; RV32IM-NEXT: addi a1, a1, 389 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: add a1, a1, a0 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a1, a2 +; RV32IM-NEXT: sub a0, a0, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_srem_sdiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: sext.w s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: addw a0, s1, a0 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_srem_sdiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 1045903 +; RV64IM-NEXT: addiw a2, a2, -733 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, 1035 +; RV64IM-NEXT: slli a2, a2, 12 +; RV64IM-NEXT: addi a2, a2, -905 +; RV64IM-NEXT: slli a2, a2, 12 +; RV64IM-NEXT: addi a2, a2, -1767 +; RV64IM-NEXT: mulh a2, a1, a2 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a1, a2 +; RV64IM-NEXT: sub a0, a0, a2 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. 
+define i32 @dont_fold_srem_power_of_two(i32 %x) { +; RV32I-LABEL: dont_fold_srem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: srli a1, a1, 26 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: andi a1, a1, -64 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: srai a1, a0, 31 +; RV32IM-NEXT: srli a1, a1, 26 +; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: andi a1, a1, -64 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: srli a1, a1, 57 +; RV64I-NEXT: andi a1, a1, 63 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: addi a2, a2, -64 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: srli a1, a1, 57 +; RV64IM-NEXT: andi a1, a1, 63 +; RV64IM-NEXT: add a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 1 +; RV64IM-NEXT: slli a2, a2, 32 +; RV64IM-NEXT: addi a2, a2, -64 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a0, zero +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. +define i32 @dont_fold_srem_i32_smax(i32 %x) { +; RV32I-LABEL: dont_fold_srem_i32_smax: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: lui a2, 524288 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i32_smax: +; RV32IM: # %bb.0: +; RV32IM-NEXT: srai a1, a0, 31 +; RV32IM-NEXT: srli a1, a1, 1 +; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: lui a2, 524288 +; RV32IM-NEXT: and a1, a1, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i32_smax: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: lui a2, 524288 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: slli a2, a2, 31 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i32_smax: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 524288 +; RV64IM-NEXT: addiw a2, a2, -1 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: add a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 1 +; RV64IM-NEXT: slli a2, a2, 31 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; RV32I-LABEL: dont_fold_srem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; 
RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a2, zero, 98 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -16 +; RV32IM-NEXT: .cfi_def_cfa_offset 16 +; RV32IM-NEXT: sw ra, 12(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: addi a2, zero, 98 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: lw ra, 12(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lui a1, 2675 +; RV64IM-NEXT: addiw a1, a1, -251 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1839 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 167 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1505 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 5 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 98 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/RISCV/srem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/RISCV/srem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -0,0 +1,1689 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: fold_srem_vec_1: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s3, 8(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, -124 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 98 +; 
RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, -1003 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_vec_1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 12(a1) +; RV32IM-NEXT: lh a3, 8(a1) +; RV32IM-NEXT: lh a4, 0(a1) +; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srli a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: lui a4, 507375 +; RV32IM-NEXT: addi a4, a4, 1981 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: sub a4, a4, a1 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, -124 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 342392 +; RV32IM-NEXT: addi a4, a4, 669 +; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 5 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 98 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 780943 +; RV32IM-NEXT: addi a4, a4, 1809 +; RV32IM-NEXT: mulh a4, a6, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, -1003 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_vec_1: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s3, 16(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, -124 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, -1003 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: 
sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_vec_1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 24(a1) +; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: lui a2, 248 +; RV64IM-NEXT: addiw a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, -265 +; RV64IM-NEXT: mulh a2, a4, a2 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: addi a5, zero, -124 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: lui a4, 2675 +; RV64IM-NEXT: addiw a4, a4, -251 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1839 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 167 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1505 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 5 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 98 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1040212 +; RV64IM-NEXT: addiw a4, a4, 1977 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1907 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -453 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1213 +; RV64IM-NEXT: mulh a4, a6, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, -1003 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: fold_srem_vec_2: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; 
RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s3, 8(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_vec_2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 12(a1) +; RV32IM-NEXT: lh a3, 8(a1) +; RV32IM-NEXT: lh a4, 0(a1) +; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: add a2, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub t0, a4, a2 +; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a2, a4, 31 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: mulh a2, a3, a5 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: srli a4, a2, 31 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: mulh a3, a6, a5 +; RV32IM-NEXT: add a3, a3, a6 +; RV32IM-NEXT: srli a4, a3, 31 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: mul a3, a3, a7 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_vec_2: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s3, 16(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 
+; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_vec_2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 24(a1) +; RV64IM-NEXT: lh a7, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub t0, a1, a2 +; RV64IM-NEXT: mulh a2, a4, a5 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: srli a1, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: mulh a2, a7, a5 +; RV64IM-NEXT: add a2, a2, a7 +; RV64IM-NEXT: srli a4, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub a2, a7, a2 +; RV64IM-NEXT: mulh a4, a6, a5 +; RV64IM-NEXT: add a4, a4, a6 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. 
+define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; RV32I-LABEL: combine_srem_sdiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lh s2, 0(a1) +; RV32I-NEXT: lh s3, 4(a1) +; RV32I-NEXT: lh s4, 8(a1) +; RV32I-NEXT: lh s1, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: add a0, s8, a0 +; RV32I-NEXT: add a1, s7, s1 +; RV32I-NEXT: add a2, s6, s4 +; RV32I-NEXT: add a3, s5, s9 +; RV32I-NEXT: sh a3, 6(s0) +; RV32I-NEXT: sh a2, 4(s0) +; RV32I-NEXT: sh a1, 2(s0) +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_srem_sdiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srai a2, a2, 6 +; RV32IM-NEXT: add t0, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, t0, a7 +; RV32IM-NEXT: sub t1, a4, a2 +; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a2, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a4, a2, a7 +; 
RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulh a4, a3, a5 +; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: srli a1, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: mul a4, a1, a7 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: mulh a4, a6, a5 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: mul a5, a4, a7 +; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: add a1, a3, a1 +; RV32IM-NEXT: add a2, t2, a2 +; RV32IM-NEXT: add a3, t1, t0 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_srem_sdiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: .cfi_def_cfa_offset 96 +; RV64I-NEXT: sd ra, 88(sp) +; RV64I-NEXT: sd s0, 80(sp) +; RV64I-NEXT: sd s1, 72(sp) +; RV64I-NEXT: sd s2, 64(sp) +; RV64I-NEXT: sd s3, 56(sp) +; RV64I-NEXT: sd s4, 48(sp) +; RV64I-NEXT: sd s5, 40(sp) +; RV64I-NEXT: sd s6, 32(sp) +; RV64I-NEXT: sd s7, 24(sp) +; RV64I-NEXT: sd s8, 16(sp) +; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: .cfi_offset s6, -64 +; RV64I-NEXT: .cfi_offset s7, -72 +; RV64I-NEXT: .cfi_offset s8, -80 +; RV64I-NEXT: .cfi_offset s9, -88 +; RV64I-NEXT: lh s2, 0(a1) +; RV64I-NEXT: lh s3, 8(a1) +; RV64I-NEXT: lh s4, 16(a1) +; RV64I-NEXT: lh s1, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s6, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s8, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: add a0, s8, a0 +; RV64I-NEXT: add a1, s7, s1 +; RV64I-NEXT: add a2, s6, s4 +; RV64I-NEXT: add a3, s5, s9 +; RV64I-NEXT: sh a3, 6(s0) +; RV64I-NEXT: sh a2, 4(s0) +; RV64I-NEXT: sh a1, 2(s0) +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: ld s9, 8(sp) +; RV64I-NEXT: ld s8, 16(sp) +; RV64I-NEXT: ld s7, 24(sp) +; RV64I-NEXT: ld s6, 32(sp) +; RV64I-NEXT: ld s5, 40(sp) +; RV64I-NEXT: ld s4, 48(sp) +; RV64I-NEXT: ld s3, 56(sp) +; RV64I-NEXT: ld s2, 64(sp) +; RV64I-NEXT: ld s1, 72(sp) +; RV64I-NEXT: ld s0, 80(sp) +; RV64I-NEXT: ld ra, 88(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: .cfi_restore s6 +; RV64I-NEXT: .cfi_restore s7 +; RV64I-NEXT: .cfi_restore s8 +; RV64I-NEXT: .cfi_restore s9 +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: 
.cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_srem_sdiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 0(a1) +; RV64IM-NEXT: lh a7, 8(a1) +; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srai a2, a2, 6 +; RV64IM-NEXT: add t3, a2, a3 +; RV64IM-NEXT: addi t0, zero, 95 +; RV64IM-NEXT: mul a3, t3, t0 +; RV64IM-NEXT: sub t1, a1, a3 +; RV64IM-NEXT: mulh a3, a4, a5 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a1, a3, 63 +; RV64IM-NEXT: srai a3, a3, 6 +; RV64IM-NEXT: add a1, a3, a1 +; RV64IM-NEXT: mul a3, a1, t0 +; RV64IM-NEXT: sub t2, a4, a3 +; RV64IM-NEXT: mulh a4, a7, a5 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: srli a3, a4, 63 +; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: mul a4, a3, t0 +; RV64IM-NEXT: sub a4, a7, a4 +; RV64IM-NEXT: mulh a5, a6, a5 +; RV64IM-NEXT: add a5, a5, a6 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a1, t2, a1 +; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_srem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: lh a0, 12(a1) +; RV32I-NEXT: lh a3, 8(a1) +; RV32I-NEXT: lh a1, 4(a1) +; RV32I-NEXT: srai a4, a2, 31 +; RV32I-NEXT: srli a4, a4, 26 +; RV32I-NEXT: add a4, a2, a4 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: addi a5, a6, -64 +; RV32I-NEXT: and a4, a4, a5 +; RV32I-NEXT: sub s2, a2, a4 +; RV32I-NEXT: srai a2, a1, 31 +; RV32I-NEXT: srli a2, a2, 27 +; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: addi a4, a6, -32 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: sub s3, a1, a2 +; RV32I-NEXT: srai a1, a3, 31 +; RV32I-NEXT: srli a1, a1, 29 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: addi a2, a6, -8 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub s1, a3, a1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s2, 0(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 8(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: lh a1, 0(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srli a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a7, a4, a2 +; RV32IM-NEXT: srai a4, a1, 31 +; RV32IM-NEXT: srli a4, a4, 26 +; RV32IM-NEXT: add a4, a1, a4 +; RV32IM-NEXT: lui a5, 16 +; RV32IM-NEXT: addi a2, a5, -64 +; RV32IM-NEXT: and a2, a4, a2 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: srai a2, a3, 31 +; RV32IM-NEXT: srli a2, a2, 27 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a4, a5, -32 +; RV32IM-NEXT: and a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: srai a3, a6, 31 +; RV32IM-NEXT: srli a3, a3, 29 +; RV32IM-NEXT: add a3, a6, a3 +; RV32IM-NEXT: addi a4, a5, -8 +; RV32IM-NEXT: and a3, a3, a4 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) +; RV32IM-NEXT: sh a7, 6(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: mv s0, a0 +; 
RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: lh a0, 24(a1) +; RV64I-NEXT: lh a3, 16(a1) +; RV64I-NEXT: lh a1, 8(a1) +; RV64I-NEXT: srai a4, a2, 63 +; RV64I-NEXT: srli a4, a4, 58 +; RV64I-NEXT: add a4, a2, a4 +; RV64I-NEXT: lui a6, 16 +; RV64I-NEXT: addiw a5, a6, -64 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: sub s2, a2, a4 +; RV64I-NEXT: srai a2, a1, 63 +; RV64I-NEXT: srli a2, a2, 59 +; RV64I-NEXT: add a2, a1, a2 +; RV64I-NEXT: addiw a4, a6, -32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: sub s3, a1, a2 +; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: srli a1, a1, 61 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: addiw a2, a6, -8 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub s1, a3, a1 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s2, 0(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 16(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a4, 0(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a7, a1, a2 +; RV64IM-NEXT: srai a2, a4, 63 +; RV64IM-NEXT: srli a2, a2, 58 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: lui a5, 16 +; RV64IM-NEXT: addiw a1, a5, -64 +; RV64IM-NEXT: and a1, a2, a1 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: srai a2, a3, 63 +; RV64IM-NEXT: srli a2, a2, 59 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: addiw a4, a5, -32 +; RV64IM-NEXT: and a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: srai a3, a6, 63 +; RV64IM-NEXT: srli a3, a3, 61 +; RV64IM-NEXT: add a3, a6, a3 +; RV64IM-NEXT: addiw a4, a5, -8 +; RV64IM-NEXT: and a3, a3, a4 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a7, 6(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_srem_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s1, 8(a1) +; RV32I-NEXT: lh a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_one: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lui a4, 820904 +; RV32IM-NEXT: addi a4, a4, -1903 +; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 9 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 654 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 729444 +; RV32IM-NEXT: addi a4, a4, 713 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulh a4, a2, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s1, 16(a1) +; RV64I-NEXT: lh a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, 
a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_one: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a2, 24(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: lui a4, 1043590 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 357 +; RV64IM-NEXT: mulh a4, a1, a4 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: lui a4, 6413 +; RV64IM-NEXT: addiw a4, a4, 1265 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1027 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1077 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 965 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 8 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 12375 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, -431 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1959 +; RV64IM-NEXT: mulh a4, a2, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 11 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
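+; For the 2^15 lane the remainder is formed with a short shift/mask sequence
+; (bias the value by its sign, mask with 0x8000, subtract) instead of a
+; multiply, and the divisor-1 lane is again stored as zero.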
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_i16_smax: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lh a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh a0, 8(a1) +; RV32I-NEXT: slli a1, a2, 16 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: srli a1, a1, 17 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lui a3, 8 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: sub s3, a2, a1 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i16_smax: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a2, 4(a1) +; RV32IM-NEXT: slli a6, a2, 16 +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: lui a5, 729444 +; RV32IM-NEXT: addi a5, a5, 713 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a3, a5, 31 +; RV32IM-NEXT: srli a5, a5, 4 +; RV32IM-NEXT: add a3, a5, a3 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: srai a4, a6, 31 +; RV32IM-NEXT: srli a4, a4, 17 +; RV32IM-NEXT: add a4, a2, a4 +; RV32IM-NEXT: lui a5, 8 +; RV32IM-NEXT: and a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i16_smax: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lh a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh a0, 16(a1) +; RV64I-NEXT: slli a1, a2, 48 +; RV64I-NEXT: srai a1, a1, 63 +; RV64I-NEXT: srli a1, a1, 49 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lui a3, 8 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: sub s3, a2, a1 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, 
a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_i16_smax: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a2, 8(a1) +; RV64IM-NEXT: slli a6, a2, 48 +; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: lui a5, 1043590 +; RV64IM-NEXT: addiw a5, a5, -1781 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1069 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1959 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 357 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a3, a5, 63 +; RV64IM-NEXT: srli a5, a5, 4 +; RV64IM-NEXT: add a3, a5, a3 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: lui a3, 12375 +; RV64IM-NEXT: addiw a3, a3, -575 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 883 +; RV64IM-NEXT: slli a3, a3, 13 +; RV64IM-NEXT: addi a3, a3, -431 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 1959 +; RV64IM-NEXT: mulh a3, a4, a3 +; RV64IM-NEXT: srli a5, a3, 63 +; RV64IM-NEXT: srli a3, a3, 11 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: srai a4, a6, 63 +; RV64IM-NEXT: srli a4, a4, 49 +; RV64IM-NEXT: add a4, a2, a4 +; RV64IM-NEXT: lui a5, 8 +; RV64IM-NEXT: and a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
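+; On RV32 every i64 lane, including the divisor-1 lane, is lowered to a
+; __moddi3 libcall; only RV64IM expands the 64-bit remainders with mulh-based
+; magic multiplies.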
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_fold_srem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lw s2, 24(a1) +; RV32I-NEXT: lw s3, 28(a1) +; RV32I-NEXT: lw s4, 16(a1) +; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: lw s6, 8(a1) +; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi a2, zero, 654 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: addi a2, zero, 23 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s1, 20(s0) +; RV32I-NEXT: sw s4, 16(s0) +; RV32I-NEXT: sw s9, 12(s0) +; RV32I-NEXT: sw s6, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -48 +; RV32IM-NEXT: .cfi_def_cfa_offset 48 +; RV32IM-NEXT: sw ra, 44(sp) +; RV32IM-NEXT: sw s0, 40(sp) +; RV32IM-NEXT: sw s1, 36(sp) +; RV32IM-NEXT: sw s2, 32(sp) +; RV32IM-NEXT: sw s3, 28(sp) +; RV32IM-NEXT: sw s4, 24(sp) +; RV32IM-NEXT: sw s5, 20(sp) +; RV32IM-NEXT: sw s6, 16(sp) +; RV32IM-NEXT: sw s7, 12(sp) +; RV32IM-NEXT: sw s8, 8(sp) +; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: .cfi_offset s5, -28 +; RV32IM-NEXT: .cfi_offset s6, 
-32 +; RV32IM-NEXT: .cfi_offset s7, -36 +; RV32IM-NEXT: .cfi_offset s8, -40 +; RV32IM-NEXT: .cfi_offset s9, -44 +; RV32IM-NEXT: lw s2, 24(a1) +; RV32IM-NEXT: lw s3, 28(a1) +; RV32IM-NEXT: lw s4, 16(a1) +; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s6, 8(a1) +; RV32IM-NEXT: lw s1, 12(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: addi a2, zero, 1 +; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s7, a0 +; RV32IM-NEXT: mv s8, a1 +; RV32IM-NEXT: addi a2, zero, 654 +; RV32IM-NEXT: mv a0, s6 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s9, a1 +; RV32IM-NEXT: addi a2, zero, 23 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s1, a1 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s1, 20(s0) +; RV32IM-NEXT: sw s4, 16(s0) +; RV32IM-NEXT: sw s9, 12(s0) +; RV32IM-NEXT: sw s6, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: lw s9, 4(sp) +; RV32IM-NEXT: lw s8, 8(sp) +; RV32IM-NEXT: lw s7, 12(sp) +; RV32IM-NEXT: lw s6, 16(sp) +; RV32IM-NEXT: lw s5, 20(sp) +; RV32IM-NEXT: lw s4, 24(sp) +; RV32IM-NEXT: lw s3, 28(sp) +; RV32IM-NEXT: lw s2, 32(sp) +; RV32IM-NEXT: lw s1, 36(sp) +; RV32IM-NEXT: lw s0, 40(sp) +; RV32IM-NEXT: lw ra, 44(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: .cfi_restore s5 +; RV32IM-NEXT: .cfi_restore s6 +; RV32IM-NEXT: .cfi_restore s7 +; RV32IM-NEXT: .cfi_restore s8 +; RV32IM-NEXT: .cfi_restore s9 +; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd a0, 24(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i64: +; 
RV64IM: # %bb.0: +; RV64IM-NEXT: ld a2, 24(a1) +; RV64IM-NEXT: ld a3, 8(a1) +; RV64IM-NEXT: ld a1, 16(a1) +; RV64IM-NEXT: lui a4, 1043590 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 357 +; RV64IM-NEXT: mulh a4, a1, a4 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: lui a4, 6413 +; RV64IM-NEXT: addiw a4, a4, 1265 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1027 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1077 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 965 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 8 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 12375 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, -431 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1959 +; RV64IM-NEXT: mulh a4, a2, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 11 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sd zero, 0(a0) +; RV64IM-NEXT: sd a2, 24(a0) +; RV64IM-NEXT: sd a3, 8(a0) +; RV64IM-NEXT: sd a1, 16(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/RISCV/urem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/RISCV/urem-lkk.ll +++ llvm/trunk/test/CodeGen/RISCV/urem-lkk.ll @@ -0,0 +1,354 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; RV32I-LABEL: fold_urem_positive_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_positive_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 364242 +; RV32IM-NEXT: addi a1, a1, 777 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: sub a2, a0, a1 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_positive_odd: +; RV64I: # 
%bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_positive_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1423 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1811 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 561 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; RV32I-LABEL: fold_urem_positive_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 1060 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_positive_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 1012964 +; RV32IM-NEXT: addi a1, a1, -61 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: srli a1, a1, 10 +; RV32IM-NEXT: addi a2, zero, 1060 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_positive_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addi a1, zero, 1060 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_positive_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1048020 +; RV64IM-NEXT: addiw a1, a1, -1793 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 139 +; RV64IM-NEXT: slli a1, a1, 14 +; RV64IM-NEXT: addi a1, a1, 1793 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -139 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: srli a1, a1, 10 +; RV64IM-NEXT: addi a2, zero, 1060 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
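+; For reference, a rough C sketch of the mulhu-based expansion checked above for
+; x % 95 (illustration only: the helper name urem95 is made up here, and the
+; constant is just the value the RV32IM checks build with lui/addi,
+; 364242 << 12 + 777 = 0x58ED2309):
+;
+;   #include <stdint.h>
+;   uint32_t urem95(uint32_t x) {
+;     uint32_t hi = (uint32_t)(((uint64_t)x * 0x58ED2309u) >> 32); // mulhu x, magic
+;     uint32_t q  = (((x - hi) >> 1) + hi) >> 6;                   // x / 95
+;     return x - q * 95u;                                          // x % 95
+;   }
+;
+; In combine_urem_udiv below the same quotient q also provides the udiv result,
+; so a single expansion is emitted and q is added back at the end.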
+define i32 @combine_urem_udiv(i32 %x) { +; RV32I-LABEL: combine_urem_udiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_urem_udiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 364242 +; RV32IM-NEXT: addi a1, a1, 777 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: sub a2, a0, a1 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a1, a2 +; RV32IM-NEXT: sub a0, a0, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_urem_udiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli s0, a0, 32 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: add a0, s1, a0 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_urem_udiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1423 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1811 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 561 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a1, a2 +; RV64IM-NEXT: sub a0, a0, a2 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 63 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. 
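+; x urem 1 is always 0, so every configuration just returns zero here.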
+define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a0, zero +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; RV32I-LABEL: dont_fold_urem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a2, zero, 98 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -16 +; RV32IM-NEXT: .cfi_def_cfa_offset 16 +; RV32IM-NEXT: sw ra, 12(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: addi a2, zero, 98 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: lw ra, 12(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: srli a1, a0, 1 +; RV64IM-NEXT: lui a2, 2675 +; RV64IM-NEXT: addiw a2, a2, -251 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 1839 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 167 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 1505 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 4 +; RV64IM-NEXT: addi a2, zero, 98 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/RISCV/urem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/RISCV/urem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -0,0 +1,1419 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: fold_urem_vec_1: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; 
RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 8(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 124 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 98 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 1003 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_vec_1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: srli a4, a1, 2 +; RV32IM-NEXT: lui a5, 135300 +; RV32IM-NEXT: addi a5, a5, 529 +; RV32IM-NEXT: mulhu a4, a4, a5 +; RV32IM-NEXT: srli a4, a4, 2 +; RV32IM-NEXT: addi a5, zero, 124 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 342392 +; RV32IM-NEXT: addi a4, a4, 669 +; RV32IM-NEXT: mulhu a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 5 +; RV32IM-NEXT: addi a5, zero, 98 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 267633 +; RV32IM-NEXT: addi a4, a4, -1809 +; RV32IM-NEXT: mulhu a4, a6, a4 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: addi a5, zero, 1003 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_vec_1: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 16(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; 
RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 124 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 1003 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_vec_1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a4, 8(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: srli a2, a4, 2 +; RV64IM-NEXT: lui a5, 264 +; RV64IM-NEXT: addiw a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 133 +; RV64IM-NEXT: mulhu a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 3 +; RV64IM-NEXT: addi a5, zero, 124 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 2675 +; RV64IM-NEXT: addiw a5, a5, -251 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1839 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 167 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1505 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 98 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 8364 +; RV64IM-NEXT: addiw a4, a4, -1977 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1907 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 453 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1213 +; RV64IM-NEXT: mulhu a4, a6, a4 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 1003 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: fold_urem_vec_2: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; 
RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 8(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_vec_2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a7, 8(a1) +; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a3, zero, 95 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub t0, a4, a2 +; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: sub a2, a1, a4 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: mulhu a2, a7, a5 +; RV32IM-NEXT: sub a4, a7, a2 +; RV32IM-NEXT: srli a4, a4, 1 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a2, a7, a2 +; RV32IM-NEXT: mulhu a4, a6, a5 +; RV32IM-NEXT: sub a5, a6, a4 +; RV32IM-NEXT: srli a5, a5, 1 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_vec_2: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 16(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu a2, 0(a1) 
+; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_vec_2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a7, 16(a1) +; RV64IM-NEXT: lhu a4, 8(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub t0, a1, a2 +; RV64IM-NEXT: mulhu a2, a4, a5 +; RV64IM-NEXT: sub a1, a4, a2 +; RV64IM-NEXT: srli a1, a1, 1 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: mulhu a2, a7, a5 +; RV64IM-NEXT: sub a4, a7, a2 +; RV64IM-NEXT: srli a4, a4, 1 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub a2, a7, a2 +; RV64IM-NEXT: mulhu a4, a6, a5 +; RV64IM-NEXT: sub a5, a6, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. 
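+; With the M extension each lane's expansion is emitted once and the quotient is
+; reused for the udiv result; without it every lane needs both a __umodsi3 (or
+; __umoddi3) call and a __udivsi3 (or __udivdi3) call.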
+define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; RV32I-LABEL: combine_urem_udiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lhu s2, 0(a1) +; RV32I-NEXT: lhu s3, 4(a1) +; RV32I-NEXT: lhu s4, 8(a1) +; RV32I-NEXT: lhu s1, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: add a0, s8, a0 +; RV32I-NEXT: add a1, s7, s1 +; RV32I-NEXT: add a2, s6, s4 +; RV32I-NEXT: add a3, s5, s9 +; RV32I-NEXT: sh a3, 6(s0) +; RV32I-NEXT: sh a2, 4(s0) +; RV32I-NEXT: sh a1, 2(s0) +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_urem_udiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 0(a1) +; RV32IM-NEXT: lhu a7, 4(a1) +; RV32IM-NEXT: lhu a4, 12(a1) +; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: srli t3, a2, 6 +; RV32IM-NEXT: addi t0, zero, 95 +; RV32IM-NEXT: mul a3, t3, t0 +; RV32IM-NEXT: sub t1, a4, a3 +; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: sub a3, a1, a4 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: mul a4, 
a3, t0 +; RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulhu a4, a7, a5 +; RV32IM-NEXT: sub a1, a7, a4 +; RV32IM-NEXT: srli a1, a1, 1 +; RV32IM-NEXT: add a1, a1, a4 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: mul a4, a1, t0 +; RV32IM-NEXT: sub a4, a7, a4 +; RV32IM-NEXT: mulhu a5, a6, a5 +; RV32IM-NEXT: sub a2, a6, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a5, a2, t0 +; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a3, t2, a3 +; RV32IM-NEXT: add a4, t1, t3 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_urem_udiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: .cfi_def_cfa_offset 96 +; RV64I-NEXT: sd ra, 88(sp) +; RV64I-NEXT: sd s0, 80(sp) +; RV64I-NEXT: sd s1, 72(sp) +; RV64I-NEXT: sd s2, 64(sp) +; RV64I-NEXT: sd s3, 56(sp) +; RV64I-NEXT: sd s4, 48(sp) +; RV64I-NEXT: sd s5, 40(sp) +; RV64I-NEXT: sd s6, 32(sp) +; RV64I-NEXT: sd s7, 24(sp) +; RV64I-NEXT: sd s8, 16(sp) +; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: .cfi_offset s6, -64 +; RV64I-NEXT: .cfi_offset s7, -72 +; RV64I-NEXT: .cfi_offset s8, -80 +; RV64I-NEXT: .cfi_offset s9, -88 +; RV64I-NEXT: lhu s2, 0(a1) +; RV64I-NEXT: lhu s3, 8(a1) +; RV64I-NEXT: lhu s4, 16(a1) +; RV64I-NEXT: lhu s1, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s6, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s8, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: add a0, s8, a0 +; RV64I-NEXT: add a1, s7, s1 +; RV64I-NEXT: add a2, s6, s4 +; RV64I-NEXT: add a3, s5, s9 +; RV64I-NEXT: sh a3, 6(s0) +; RV64I-NEXT: sh a2, 4(s0) +; RV64I-NEXT: sh a1, 2(s0) +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: ld s9, 8(sp) +; RV64I-NEXT: ld s8, 16(sp) +; RV64I-NEXT: ld s7, 24(sp) +; RV64I-NEXT: ld s6, 32(sp) +; RV64I-NEXT: ld s5, 40(sp) +; RV64I-NEXT: ld s4, 48(sp) +; RV64I-NEXT: ld s3, 56(sp) +; RV64I-NEXT: ld s2, 64(sp) +; RV64I-NEXT: ld s1, 72(sp) +; RV64I-NEXT: ld s0, 80(sp) +; RV64I-NEXT: ld ra, 88(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: .cfi_restore s6 +; RV64I-NEXT: .cfi_restore s7 +; RV64I-NEXT: .cfi_restore s8 +; RV64I-NEXT: .cfi_restore s9 +; RV64I-NEXT: addi sp, sp, 96 +; 
RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_urem_udiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 0(a1) +; RV64IM-NEXT: lhu a7, 8(a1) +; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: srli t3, a2, 6 +; RV64IM-NEXT: addi t0, zero, 95 +; RV64IM-NEXT: mul a3, t3, t0 +; RV64IM-NEXT: sub t1, a1, a3 +; RV64IM-NEXT: mulhu a3, a4, a5 +; RV64IM-NEXT: sub a1, a4, a3 +; RV64IM-NEXT: srli a1, a1, 1 +; RV64IM-NEXT: add a1, a1, a3 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: mul a3, a1, t0 +; RV64IM-NEXT: sub t2, a4, a3 +; RV64IM-NEXT: mulhu a4, a7, a5 +; RV64IM-NEXT: sub a3, a7, a4 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 6 +; RV64IM-NEXT: mul a4, a3, t0 +; RV64IM-NEXT: sub a4, a7, a4 +; RV64IM-NEXT: mulhu a5, a6, a5 +; RV64IM-NEXT: sub a2, a6, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a1, t2, a1 +; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
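+; An unsigned remainder by a power of two is just a mask (x % 64 == (x & 63)),
+; so those lanes become plain andi instructions and only the divisor-95 lane
+; needs a libcall or a multiply-based expansion.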
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lhu s2, 8(a1) +; RV32I-NEXT: lhu s3, 4(a1) +; RV32I-NEXT: lhu s1, 0(a1) +; RV32I-NEXT: lhu a2, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: andi a1, s1, 63 +; RV32I-NEXT: andi a2, s3, 31 +; RV32I-NEXT: andi a3, s2, 7 +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh a3, 4(s0) +; RV32I-NEXT: sh a2, 2(s0) +; RV32I-NEXT: sh a1, 0(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 8(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 12(a1) +; RV32IM-NEXT: lhu a1, 0(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: andi a1, a1, 63 +; RV32IM-NEXT: andi a3, a3, 31 +; RV32IM-NEXT: andi a4, a6, 7 +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lhu s2, 16(a1) +; RV64I-NEXT: lhu s3, 8(a1) +; RV64I-NEXT: lhu s1, 0(a1) +; RV64I-NEXT: lhu a2, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: andi a1, s1, 63 +; RV64I-NEXT: andi a2, s3, 31 +; RV64I-NEXT: andi a3, s2, 7 +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh a3, 4(s0) +; RV64I-NEXT: sh a2, 2(s0) +; RV64I-NEXT: sh a1, 0(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 16(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 0(a1) +; 
RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: andi a2, a4, 63 +; RV64IM-NEXT: andi a3, a3, 31 +; RV64IM-NEXT: andi a4, a6, 7 +; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s1, 8(a1) +; RV32I-NEXT: lhu a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_one: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a2, 4(a1) +; RV32IM-NEXT: lhu a3, 12(a1) +; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: srli a4, a2, 1 +; RV32IM-NEXT: lui a5, 820904 +; RV32IM-NEXT: addi a5, a5, -1903 +; RV32IM-NEXT: mulhu a4, a4, a5 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: addi a5, zero, 654 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: lui a4, 729444 +; RV32IM-NEXT: addi a4, a4, 713 +; RV32IM-NEXT: mulhu a4, a1, a4 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulhu a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: 
sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_one: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a2, 24(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a1, 16(a1) +; RV64IM-NEXT: lui a4, 3206 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, 713 +; RV64IM-NEXT: mulhu a4, a1, a4 +; RV64IM-NEXT: sub a5, a1, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 6413 +; RV64IM-NEXT: addiw a5, a5, 1265 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1027 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1077 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 965 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1044567 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, -861 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -179 +; RV64IM-NEXT: mulhu a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 12 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_fold_urem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lw s2, 24(a1) +; RV32I-NEXT: lw s3, 28(a1) +; RV32I-NEXT: lw s4, 16(a1) +; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: lw s6, 8(a1) +; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi a2, zero, 654 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: addi a2, zero, 23 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s1, 20(s0) +; RV32I-NEXT: sw s4, 16(s0) +; RV32I-NEXT: sw s9, 12(s0) +; RV32I-NEXT: sw s6, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -48 +; RV32IM-NEXT: .cfi_def_cfa_offset 48 +; RV32IM-NEXT: sw ra, 44(sp) +; RV32IM-NEXT: sw s0, 40(sp) +; RV32IM-NEXT: sw s1, 36(sp) +; RV32IM-NEXT: sw s2, 32(sp) +; RV32IM-NEXT: sw s3, 28(sp) +; RV32IM-NEXT: sw s4, 24(sp) +; RV32IM-NEXT: sw s5, 20(sp) +; RV32IM-NEXT: sw s6, 16(sp) +; RV32IM-NEXT: sw s7, 12(sp) +; RV32IM-NEXT: sw s8, 8(sp) +; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: .cfi_offset s5, -28 +; RV32IM-NEXT: .cfi_offset 
s6, -32 +; RV32IM-NEXT: .cfi_offset s7, -36 +; RV32IM-NEXT: .cfi_offset s8, -40 +; RV32IM-NEXT: .cfi_offset s9, -44 +; RV32IM-NEXT: lw s2, 24(a1) +; RV32IM-NEXT: lw s3, 28(a1) +; RV32IM-NEXT: lw s4, 16(a1) +; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s6, 8(a1) +; RV32IM-NEXT: lw s1, 12(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: addi a2, zero, 1 +; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s7, a0 +; RV32IM-NEXT: mv s8, a1 +; RV32IM-NEXT: addi a2, zero, 654 +; RV32IM-NEXT: mv a0, s6 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s9, a1 +; RV32IM-NEXT: addi a2, zero, 23 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s1, a1 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s1, 20(s0) +; RV32IM-NEXT: sw s4, 16(s0) +; RV32IM-NEXT: sw s9, 12(s0) +; RV32IM-NEXT: sw s6, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: lw s9, 4(sp) +; RV32IM-NEXT: lw s8, 8(sp) +; RV32IM-NEXT: lw s7, 12(sp) +; RV32IM-NEXT: lw s6, 16(sp) +; RV32IM-NEXT: lw s5, 20(sp) +; RV32IM-NEXT: lw s4, 24(sp) +; RV32IM-NEXT: lw s3, 28(sp) +; RV32IM-NEXT: lw s2, 32(sp) +; RV32IM-NEXT: lw s1, 36(sp) +; RV32IM-NEXT: lw s0, 40(sp) +; RV32IM-NEXT: lw ra, 44(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: .cfi_restore s5 +; RV32IM-NEXT: .cfi_restore s6 +; RV32IM-NEXT: .cfi_restore s7 +; RV32IM-NEXT: .cfi_restore s8 +; RV32IM-NEXT: .cfi_restore s9 +; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd a0, 24(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: 
dont_fold_urem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: ld a2, 24(a1) +; RV64IM-NEXT: ld a3, 8(a1) +; RV64IM-NEXT: ld a1, 16(a1) +; RV64IM-NEXT: lui a4, 3206 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, 713 +; RV64IM-NEXT: mulhu a4, a1, a4 +; RV64IM-NEXT: sub a5, a1, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 6413 +; RV64IM-NEXT: addiw a5, a5, 1265 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1027 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1077 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 965 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1044567 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, -861 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -179 +; RV64IM-NEXT: mulhu a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 12 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sd zero, 0(a0) +; RV64IM-NEXT: sd a2, 24(a0) +; RV64IM-NEXT: sd a3, 8(a0) +; RV64IM-NEXT: sd a1, 16(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/X86/srem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/srem-lkk.ll +++ llvm/trunk/test/CodeGen/X86/srem-lkk.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $6, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $95, %ecx, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $1037275121, %rax, %rcx # imm = 0x3DD38FF1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, %rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $1060, %ecx, %ecx # imm = 0x424 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1520762971, %rax, %rcx # imm = 0xA55AFFA5 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, 
%rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $-723, %ecx, %ecx # imm = 0xFD2D +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-47844377, %rax, %rcx # imm = 0xFD25F3E7 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, %rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $-22981, %ecx, %ecx # imm = 0xA63B +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $6, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $95, %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: leal 63(%rax), %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %ecx +; CHECK-NEXT: andl $-64, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal 2147483647(%rdi), %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: imulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: sarq $5, %rdx +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/X86/srem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/srem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/X86/srem-vector-lkk.ll @@ -0,0 +1,556 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $9, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 +; SSE-NEXT: shrl $16, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: movswl %dx, %esi +; SSE-NEXT: shrl $15, %edx +; SSE-NEXT: sarl $6, %esi +; SSE-NEXT: addl %edx, %esi +; SSE-NEXT: imull $95, %esi, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pextrw $1, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $21, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $-124, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $18, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $98, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_srem_vec_1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: 
subl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $9, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 +; AVX-NEXT: shrl $16, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: movswl %dx, %esi +; AVX-NEXT: shrl $15, %edx +; AVX-NEXT: sarl $6, %esi +; AVX-NEXT: addl %edx, %esi +; AVX-NEXT: imull $95, %esi, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpextrw $1, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $21, %edx +; AVX-NEXT: addl %esi, %edx +; AVX-NEXT: imull $-124, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $18, %edx +; AVX-NEXT: addl %esi, %edx +; AVX-NEXT: imull $98, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $6, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_srem_vec_2: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; SSE-LABEL: combine_srem_sdiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $6, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_srem_sdiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 31(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: leal 63(%rcx), %edx +; SSE-NEXT: testw %cx, %cx +; SSE-NEXT: cmovnsl %ecx, %edx +; SSE-NEXT: andl $-64, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: leal 7(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-8, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $6, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: imull $95, %edx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 31(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: leal 63(%rcx), %edx +; AVX-NEXT: testw %cx, %cx +; AVX-NEXT: cmovnsl %ecx, %edx +; AVX-NEXT: andl $-64, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: leal 7(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-8, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $6, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: imull $95, %edx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_one: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; SSE-NEXT: movl %ecx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $23, %ecx +; SSE-NEXT: addl %esi, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: shrl $31, %edx +; SSE-NEXT: sarl $26, %ecx +; SSE-NEXT: addl %edx, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $23, %ecx +; AVX-NEXT: addl %esi, %ecx +; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: shrl $31, %edx +; AVX-NEXT: sarl $26, %ecx +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_i16_smax: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 32767(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: shrl $31, %edx +; SSE-NEXT: sarl $26, %ecx +; SSE-NEXT: addl %edx, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_i16_smax: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 32767(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: shrl $31, %edx +; AVX-NEXT: sarl $26, %ecx +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_srem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $4, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: leaq (%rdx,%rdx,2), %rax +; SSE-NEXT: shlq $3, %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: pextrq $1, %xmm2, %rcx +; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $11, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $8, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_srem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $4, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX1-NEXT: shlq $3, %rax +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $11, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $8, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_srem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $4, %rdx +; AVX2-NEXT: addq %rax, %rdx +; 
AVX2-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX2-NEXT: shlq $3, %rax +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $11, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $8, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/trunk/test/CodeGen/X86/urem-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/urem-lkk.ll +++ llvm/trunk/test/CodeGen/X86/urem-lkk.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: subl %ecx, %edx +; CHECK-NEXT: shrl %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: shrl $6, %edx +; CHECK-NEXT: imull $95, %edx, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3 +; CHECK-NEXT: imulq %rcx, %rdx +; CHECK-NEXT: shrq $42, %rdx +; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. +define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: imulq $1491936009, %rax, %rax # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: shrl $6, %ecx +; CHECK-NEXT: imull $95, %ecx, %eax +; CHECK-NEXT: subl %eax, %edi +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. 
+define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: shrq $4, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/trunk/test/CodeGen/X86/urem-vector-lkk.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/urem-vector-lkk.ll +++ llvm/trunk/test/CodeGen/X86/urem-vector-lkk.ll @@ -0,0 +1,378 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shrl $2, %ecx +; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 +; SSE-NEXT: shrl $19, %ecx +; SSE-NEXT: imull $124, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: movzwl %cx, %edx +; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 +; SSE-NEXT: shrl $22, %edx +; SSE-NEXT: imull $95, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shrl %ecx +; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; SSE-NEXT: shrl $17, %ecx +; SSE-NEXT: imull $98, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: shrl %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: shrl $9, %edx +; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_urem_vec_1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $2, %ecx +; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 +; AVX-NEXT: shrl $19, %ecx +; AVX-NEXT: imull $124, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: movzwl %cx, %edx +; AVX-NEXT: imull 
$44151, %edx, %edx # imm = 0xAC77 +; AVX-NEXT: shrl $22, %edx +; AVX-NEXT: imull $95, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl %ecx +; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; AVX-NEXT: shrl $17, %ecx +; AVX-NEXT: imull $98, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: shrl %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: shrl $9, %edx +; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_urem_vec_2: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. +define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; SSE-LABEL: combine_urem_udiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_urem_udiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; SSE-NEXT: shrl $22, %ecx +; SSE-NEXT: imull $95, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %ecx +; SSE-NEXT: andl $31, %ecx +; SSE-NEXT: movd %xmm0, %edx +; SSE-NEXT: andl $63, %edx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: andl $7, %ecx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; AVX-NEXT: shrl $22, %ecx +; AVX-NEXT: imull $95, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpextrw $1, %xmm0, %ecx +; AVX-NEXT: andl $31, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: andl $63, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: andl $7, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_one: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: shrl %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: shrl $4, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B +; SSE-NEXT: shrl $25, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; SSE-NEXT: shrl $26, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: shrl %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: shrl $4, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B +; AVX-NEXT: shrl $25, %ecx +; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; AVX-NEXT: shrl $26, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret 
<4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. +define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_urem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: addq %rdx, %rax +; SSE-NEXT: shrq $4, %rax +; SSE-NEXT: leaq (%rax,%rax,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: addq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $12, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $7, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_urem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: addq %rdx, %rax +; AVX1-NEXT: shrq $4, %rax +; AVX1-NEXT: leaq (%rax,%rax,2), %rdx +; AVX1-NEXT: shlq $3, %rdx +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $12, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $7, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_urem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: shrq $4, %rax +; AVX2-NEXT: leaq (%rax,%rax,2), %rdx +; AVX2-NEXT: shlq $3, 
%rdx +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: shrq $12, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: shrq $7, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} \ No newline at end of file