Index: lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- lib/Target/ARM/ARMInstrMVE.td
+++ lib/Target/ARM/ARMInstrMVE.td
@@ -5008,16 +5008,46 @@
 def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
                          (masked_ld node:$ptr, node:$pred, node:$passthru)>;
 
-def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                                   (masked_st node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                           (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                     (maskedstore8 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                            (masked_st node:$val, node:$ptr, node:$pred), [{
+  auto *St = cast<MaskedStoreSDNode>(N);
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return ScalarVT == MVT::i16 || ScalarVT == MVT::f16;
+}]>;
+def unalignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                     (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() < 2;
 }]>;
 def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                                   (masked_st node:$val, node:$ptr, node:$pred), [{
+                                   (maskedstore16 node:$val, node:$ptr, node:$pred), [{
   return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
 }]>;
-def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                          (masked_st node:$val, node:$ptr, node:$pred)>;
+def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                      (alignedmaskedstore16 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                            (masked_st node:$val, node:$ptr, node:$pred), [{
+  auto *St = cast<MaskedStoreSDNode>(N);
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return ScalarVT == MVT::i32 || ScalarVT == MVT::f32;
+}]>;
+def unalignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                     (maskedstore32 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() < 4;
+}]>;
+def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                   (maskedstore32 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+}]>;
 
 let Predicates = [HasMVEInt, IsLE] in {
   // Stores
@@ -5039,13 +5069,13 @@
   defm : MVE_vector_offset_store;
 
   // Unaligned masked stores (aligned are below)
-  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore32 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
             (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore32 (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
             (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore16 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
             (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore16 (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
 
   // Unaligned masked loads
@@ -5113,13 +5143,13 @@
   def : MVE_vector_offset_store_typed;
 
   // Unaligned masked stores (aligned are below)
-  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore32 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore32 (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore16 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+  def : Pat<(unalignedmaskedstore16 (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
   // Unaligned masked loads
   def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
@@ -5134,7 +5164,7 @@
 
 let Predicates = [HasMVEInt] in {
   // Aligned masked store, shared between LE and BE
-  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore, 0>;
+  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>;
   def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
   def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
   def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
@@ -5145,6 +5175,13 @@
   def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
   def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
   def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+  // Truncating stores
+  def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
 }
 
 // Widening/Narrowing Loads/Stores
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -495,15 +495,27 @@
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;
 
-  if (DataTy->isVectorTy()) {
-    // We don't yet support narrowing or widening masked loads/stores. Expand
-    // them for the moment.
-    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
-    if (VecWidth != 128)
+  unsigned EltWidth = DataTy->getScalarSizeInBits();
+  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+    // Don't support v2i1 yet.
+    if (VecTy->getNumElements() == 2)
       return false;
-  }
 
-  unsigned EltWidth = DataTy->getScalarSizeInBits();
+    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+    if (VecWidth != 128) {
+      // We don't support extending fp types.
+      if (VecTy->getElementType()->isFloatingPointTy())
+        return false;
+
+      // Only support extending integers if the memory is aligned.
+      if ((EltWidth == 16 && Alignment < 2) ||
+          (EltWidth == 32 && Alignment < 4))
+        return false;
+    }
+
+    if (VecWidth > 128)
+      return false;
+  }
   return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
 }
Index: test/CodeGen/Thumb2/mve-masked-store.ll
===================================================================
--- test/CodeGen/Thumb2/mve-masked-store.ll
+++ test/CodeGen/Thumb2/mve-masked-store.ll
@@ -638,7 +638,154 @@
   ret void
 }
 
+define arm_aapcs_vfpcc void @masked_v4i16(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: masked_v4i16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT:    vstrht.32 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: masked_v4i16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT:    vstrht.32 q1, [r0]
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %c = icmp sgt <4 x i32> %a, zeroinitializer
+  %trunc = trunc <4 x i32> %a to <4 x i16>
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 2, <4 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v4i8(<4 x i8> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: masked_v4i8:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT:    vstrbt.32 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: masked_v4i8:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT:    vstrbt.32 q1, [r0]
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %c = icmp sgt <4 x i32> %a, zeroinitializer
+  %trunc = trunc <4 x i32> %a to <4 x i8>
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %trunc, <4 x i8>* %dest, i32 1, <4 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v8i8(<8 x i8> *%dest, <8 x i16> %a) {
+; CHECK-LE-LABEL: masked_v8i8:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT:    vstrbt.16 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: masked_v8i8:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT:    vstrbt.16 q1, [r0]
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %c = icmp sgt <8 x i16> %a, zeroinitializer
+  %trunc = trunc <8 x i16> %a to <8 x i8>
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %trunc, <8 x i8>* %dest, i32 1, <8 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v4i16_align1(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: masked_v4i16_align1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #4
+; CHECK-LE-NEXT:    sub sp, #4
+; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r1, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r1, #0
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    itt ne
+; CHECK-LE-NEXT:    vmovne r2, s0
+; CHECK-LE-NEXT:    strhne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s1
+; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s2
+; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r1, s3
+; CHECK-LE-NEXT:    strhmi r1, [r0, #6]
+; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: masked_v4i16_align1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #4
+; CHECK-BE-NEXT:    sub sp, #4
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r1, #0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    itt ne
+; CHECK-BE-NEXT:    vmovne r2, s4
+; CHECK-BE-NEXT:    strhne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s5
+; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s6
+; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r1, s7
+; CHECK-BE-NEXT:    strhmi r1, [r0, #6]
+; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %c = icmp sgt <4 x i32> %a, zeroinitializer
+  %trunc = trunc <4 x i32> %a to <4 x i16>
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 1, <4 x i1> %c)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)