Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5148,7 +5148,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(0, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5157,7 +5157,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(8, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5166,7 +5166,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(16, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5175,7 +5175,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(24, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5184,7 +5184,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(0, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5193,7 +5193,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(8, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }
   }

@@ -5348,7 +5348,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(0, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5357,7 +5357,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(8, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5366,7 +5366,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(16, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5375,7 +5375,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(24, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5384,7 +5384,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(0, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }

     if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5393,7 +5393,7 @@
       SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                 DAG.getConstant(CnstVal, MVT::i32),
                                 DAG.getConstant(8, MVT::i32));
-      return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
     }
   }

Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -4962,36 +4962,47 @@
 def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;

 def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;

 def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;

 def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

 def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;

 // Natural vector casts (128 bit)
 def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;

 def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;

 def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;

 def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
@@ -5003,6 +5014,7 @@
 def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;

 def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
Index: test/CodeGen/AArch64/aarch64-be-bv.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-be-bv.ll
+++ test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -242,6 +242,138 @@
   ret i16 %el
 }

+; CHECK-LABEL: bic_modimm_t1:
+define i16 @bic_modimm_t1() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t2:
+define i16 @bic_modimm_t2() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t3:
+define i16 @bic_modimm_t3() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t4:
+define i16 @bic_modimm_t4() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t5:
+define i16 @bic_modimm_t5() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t6:
+define i16 @bic_modimm_t6() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t1:
+define i16 @orr_modimm_t1() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t2:
+define i16 @orr_modimm_t2() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t3:
+define i16 @orr_modimm_t3() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t4:
+define i16 @orr_modimm_t4() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t5:
+define i16 @orr_modimm_t5() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t6:
+define i16 @orr_modimm_t6() nounwind {
+  ; CHECK:      ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
 declare i8 @f_v8i8(<8 x i8> %arg)
 declare i16 @f_v4i16(<4 x i16> %arg)
 declare i32 @f_v2i32(<2 x i32> %arg)
@@ -249,8 +381,8 @@
 declare i16 @f_v8i16(<8 x i16> %arg)
 declare i32 @f_v4i32(<4 x i32> %arg)

-; CHECK-LABEL: movi_modimm_t1_call:
-define void @movi_modimm_t1_call() {
+; CHECK-LABEL: modimm_t1_call:
+define void @modimm_t1_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -282,8 +414,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t2_call:
-define void @movi_modimm_t2_call() {
+; CHECK-LABEL: modimm_t2_call:
+define void @modimm_t2_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8, lsl #8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -315,8 +447,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t3_call:
-define void @movi_modimm_t3_call() {
+; CHECK-LABEL: modimm_t3_call:
+define void @modimm_t3_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8, lsl #16
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -348,8 +480,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t4_call:
-define void @movi_modimm_t4_call() {
+; CHECK-LABEL: modimm_t4_call:
+define void @modimm_t4_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8, lsl #24
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -381,8 +513,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t5_call:
-define void @movi_modimm_t5_call() {
+; CHECK-LABEL: modimm_t5_call:
+define void @modimm_t5_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].4h, #0x8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -414,8 +546,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t6_call:
-define void @movi_modimm_t6_call() {
+; CHECK-LABEL: modimm_t6_call:
+define void @modimm_t6_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].4h, #0x8, lsl #8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -447,8 +579,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t7_call:
-define void @movi_modimm_t7_call() {
+; CHECK-LABEL: modimm_t7_call:
+define void @modimm_t7_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8, msl #8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -480,8 +612,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t8_call:
-define void @movi_modimm_t8_call() {
+; CHECK-LABEL: modimm_t8_call:
+define void @modimm_t8_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].2s, #0x8, msl #16
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -513,8 +645,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t9_call:
-define void @movi_modimm_t9_call() {
+; CHECK-LABEL: modimm_t9_call:
+define void @modimm_t9_call() {
   ; CHECK:      movi v[[REG1:[0-9]+]].8b, #0x8
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -546,8 +678,8 @@
   ret void
 }

-; CHECK-LABEL: movi_modimm_t10_call:
-define void @movi_modimm_t10_call() {
+; CHECK-LABEL: modimm_t10_call:
+define void @modimm_t10_call() {
   ; CHECK:      movi d[[REG1:[0-9]+]], #0x0000ff000000ff
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -579,8 +711,8 @@
   ret void
 }

-; CHECK-LABEL: fmov_modimm_t11_call:
-define void @fmov_modimm_t11_call() {
+; CHECK-LABEL: modimm_t11_call:
+define void @modimm_t11_call() {
   ; CHECK:      fmov v[[REG1:[0-9]+]].2s, #4.00000000
   ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
   ; CHECK-NEXT: bl f_v8i8
@@ -612,8 +744,8 @@
   ret void
 }

-; CHECK-LABEL: fmov_modimm_t12_call:
-define void @fmov_modimm_t12_call() {
+; CHECK-LABEL: modimm_t12_call:
+define void @modimm_t12_call() {
   ; CHECK:      fmov v[[REG1:[0-9]+]].2d, #0.18750000
   ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
   ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8