diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -638,6 +638,18 @@
       InputTy = PreTruncTy;
     }
   }
+  if (InputTy->getPrimitiveSizeInBits() < 128 &&
+      InputTy->isIntOrIntVectorTy()) {
+    // If we can't find a trunc to incorporate into the instruction, create an
+    // implicit one with a zext, so that we can still create a scatter. We know
+    // that the input type is 4x/8x/16x and of type i8/i16/i32, so any type
+    // smaller than 128 bits will divide evenly into a 128-bit vector.
+    InputTy = InputTy->getWithNewBitWidth(
+        128 / cast<FixedVectorType>(InputTy)->getNumElements());
+    Input = Builder.CreateZExt(Input, InputTy);
+    LLVM_DEBUG(dbgs() << "masked scatters: Small input type, extending with:\n"
+                      << *Input << "\n");
+  }
   if (InputTy->getPrimitiveSizeInBits() != 128) {
     LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for "
                          "non-standard input types. Expanding.\n");
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
@@ -361,19 +361,11 @@
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.s32 q1, [r1]
-; CHECK-NEXT:    vmov r1, r3, d0
-; CHECK-NEXT:    vmov r4, r5, d1
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, r12, d2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    strb.w r3, [r12]
-; CHECK-NEXT:    strb r4, [r2]
-; CHECK-NEXT:    strb.w r5, [lr]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.i32 q1, #0xff
+; CHECK-NEXT:    vldrb.s32 q2, [r1]
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vstrb.32 q0, [r0, q2]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
   %offs.sext = sext <4 x i8> %offs to <4 x i32>
@@ -386,19 +378,11 @@
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.u32 q1, [r1]
-; CHECK-NEXT:    vmov r1, r3, d0
-; CHECK-NEXT:    vmov r4, r5, d1
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, r12, d2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    strb.w r3, [r12]
-; CHECK-NEXT:    strb r4, [r2]
-; CHECK-NEXT:    strb.w r5, [lr]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.i32 q1, #0xff
+; CHECK-NEXT:    vldrb.u32 q2, [r1]
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vstrb.32 q0, [r0, q2]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
   %offs.zext = zext <4 x i8> %offs to <4 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -16,37 +16,13 @@
   ret void
 }
 
-; Expanded ?
 define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.u32 q1, [r1]
-; CHECK-NEXT:    vmov.u16 r6, q0[0]
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, r3, d2
-; CHECK-NEXT:    vmov r12, lr, d3
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmov r4, r5, d3
-; CHECK-NEXT:    strb r6, [r2]
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    strb r2, [r3]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    strb.w r2, [r12]
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    strb.w r2, [lr]
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    strb r2, [r0]
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    strb r0, [r1]
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    strb r0, [r4]
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    strb r0, [r5]
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -245,12 +245,10 @@
 define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
 ; CHECK-LABEL: ptr_v4i16_dup:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, r2, d0
-; CHECK-NEXT:    vmov r3, r12, d1
-; CHECK-NEXT:    strh r0, [r1]
-; CHECK-NEXT:    strh r0, [r2]
-; CHECK-NEXT:    strh r0, [r3]
-; CHECK-NEXT:    strh.w r0, [r12]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, q0]
 ; CHECK-NEXT:    bx lr
 entry:
   %ext = trunc i32 %v to i16
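
Note (not part of the patch): the following is a minimal, hand-written sketch of the IR-level rewrite the new block performs for the unscaled_v8i8_i8 test above. The <8 x i8> scatter input is only 64 bits wide, so the pass zero-extends it to a full 128-bit <8 x i16> vector (128 / 8 lanes = 16 bits per lane) and emits a truncating i8 scatter, which selects to vstrb.16. The function name, the pre-narrowed <8 x i16> offset operand, and the exact output shape are illustrative assumptions, not generated pass output.

define arm_aapcs_vfpcc void @unscaled_v8i8_i8_sketch(i8* %base, <8 x i16> %offs, <8 x i8> %input) {
entry:
  ; The implicit trunc is materialised as a zext to the 128-bit type
  ; computed by InputTy->getWithNewBitWidth(128 / NumElements).
  %input.zext = zext <8 x i8> %input to <8 x i16>
  ; Truncating scatter: each i16 lane stores only its low 8 bits.
  ; Operands: base, offsets, data, memory element size in bits, scale (0 = unscaled).
  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offs, <8 x i16> %input.zext, i32 8, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)

In the updated CHECK lines this zext appears as the vmovlb.u8; since the truncating store only keeps the bottom byte of each lane, the choice of zext over sext does not affect the stored values.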