Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14741,6 +14741,9 @@ case Instruction::Mul: return true; case Instruction::Sub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: return Operand == 1; default: return false; Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4068,6 +4068,18 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; + +def splat32_negated : PatFrag<(ops node:$val), + (sub (ARMvmovImm 0), (ARMvdup node:$val))>; +def splat16_negated : PatFrag<(ops node:$val), + (sub (v8i16 (bitconvert (v4i32 (ARMvmovImm 0)))), + (ARMvdup node:$val))>; +def splat8_negated : PatFrag<(ops node:$val), + (sub (v16i8 (bitconvert (v4i32 (ARMvmovImm 0)))), + (ARMvdup node:$val))>; +def negate_scalar : OutPatFrag<(ops node:$val), + (t2RSBri $val, 0)>; + let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; @@ -4082,6 +4094,20 @@ (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + + def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (splat32_negated GPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; + def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (splat16_negated GPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; + def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (splat8_negated GPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; + + def : 
Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (splat32_negated GPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; + def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (splat16_negated GPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; + def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (splat8_negated GPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), (negate_scalar GPR:$Rm)))>; } class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> Index: llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll @@ -0,0 +1,422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s + +define dso_local arm_aapcs_vfpcc void @sink_shl_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { +; CHECK-LABEL: sink_shl_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #16 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #16 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vshl.u32 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #16]! 
+; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i32, i32* %in, i32 %index + %cast.in = bitcast i32* %gep.in to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 + %res = shl <4 x i32> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i32, i32* %out, i32 %index + %cast.out = bitcast i32* %gep.out to <4 x i32>* + store <4 x i32> %res, <4 x i32>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_shl_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { +; CHECK-LABEL: sink_shl_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #8 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #8 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vshl.u16 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #8]! 
+; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 + %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i16, i16* %in, i32 %index + %cast.in = bitcast i16* %gep.in to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 + %res = shl <8 x i16> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i16, i16* %out, i32 %index + %cast.out = bitcast i16* %gep.out to <8 x i16>* + store <8 x i16> %res, <8 x i16>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_shl_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { +; CHECK-LABEL: sink_shl_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vshl.u8 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #4]! 
+; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 + %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i8, i8* %in, i32 %index + %cast.in = bitcast i8* %gep.in to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 + %res = shl <16 x i8> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i8, i8* %out, i32 %index + %cast.out = bitcast i8* %gep.out to <16 x i8>* + store <16 x i8> %res, <16 x i8>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { +; CHECK-LABEL: sink_lshr_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #16 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #16 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vshl.u32 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #16]! 
+; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i32, i32* %in, i32 %index + %cast.in = bitcast i32* %gep.in to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 + %res = lshr <4 x i32> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i32, i32* %out, i32 %index + %cast.out = bitcast i32* %gep.out to <4 x i32>* + store <4 x i32> %res, <4 x i32>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { +; CHECK-LABEL: sink_lshr_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #8 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #8 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vshl.u16 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #8]! 
+; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 + %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i16, i16* %in, i32 %index + %cast.in = bitcast i16* %gep.in to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 + %res = lshr <8 x i16> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i16, i16* %out, i32 %index + %cast.out = bitcast i16* %gep.out to <8 x i16>* + store <8 x i16> %res, <8 x i16>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { +; CHECK-LABEL: sink_lshr_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vshl.u8 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #4]! 
+; CHECK-NEXT: le lr, .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 + %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i8, i8* %in, i32 %index + %cast.in = bitcast i8* %gep.in to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 + %res = lshr <16 x i8> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i8, i8* %out, i32 %index + %cast.out = bitcast i8* %gep.out to <16 x i8>* + store <16 x i8> %res, <16 x i8>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { +; CHECK-LABEL: sink_ashr_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #16 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #16 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vshl.s32 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #16]! 
+; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i32, i32* %in, i32 %index + %cast.in = bitcast i32* %gep.in to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 + %res = ashr <4 x i32> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i32, i32* %out, i32 %index + %cast.out = bitcast i32* %gep.out to <4 x i32>* + store <4 x i32> %res, <4 x i32>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { +; CHECK-LABEL: sink_ashr_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #8 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #8 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vshl.s16 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #8]! 
+; CHECK-NEXT: le lr, .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 + %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i16, i16* %in, i32 %index + %cast.in = bitcast i16* %gep.in to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 + %res = ashr <8 x i16> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i16, i16* %out, i32 %index + %cast.out = bitcast i16* %gep.out to <8 x i16>* + store <8 x i16> %res, <8 x i16>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} + +define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { +; CHECK-LABEL: sink_ashr_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vshl.s8 q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1, #4]! 
+; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.ph + +vector.ph: + %n.vec = and i32 %N, -4 + %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 + %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.in = getelementptr inbounds i8, i8* %in, i32 %index + %cast.in = bitcast i8* %gep.in to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 + %res = ashr <16 x i8> %wide.load, %broadcast.splat11 + %gep.out = getelementptr inbounds i8, i8* %out, i32 %index + %cast.out = bitcast i8* %gep.out to <16 x i8>* + store <16 x i8> %res, <16 x i8>* %cast.out, align 4 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %n.vec + br i1 %cmp, label %exit, label %vector.body + +exit: + ret void +} Index: llvm/test/CodeGen/Thumb2/mve-shifts.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shifts.ll +++ llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -411,9 +411,8 @@ define arm_aapcs_vfpcc <4 x i32> @shru_qr_int32_t(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: shru_qr_int32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vneg.s32 q1, q1 -; CHECK-NEXT: vshl.u32 q0, q0, q1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vshl.u32 q0, r0 ; CHECK-NEXT: bx lr entry: %i = insertelement <4 x i32> undef, i32 %src2, i32 0 @@ -477,9 +476,8 @@ define arm_aapcs_vfpcc <4 x i32> @shrs_qr_int32_t(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: shrs_qr_int32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vneg.s32 q1, q1 -; CHECK-NEXT: vshl.s32 q0, q0, q1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vshl.s32 q0, r0 ; CHECK-NEXT: bx lr entry: %i = insertelement <4 x i32> 
undef, i32 %src2, i32 0