Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -5000,12 +5000,12 @@ Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], - [IntrNoMem]>; - def int_x86_avx512_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], - [IntrNoMem]>; + def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -5487,67 +5487,6 @@ } } -multiclass avx512_sqrt_scalar opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int, - OpndItins itins_s, OpndItins itins_d> { - def SSZr : SI, XS, EVEX_4V; - let isCodeGenOnly = 1 in - def SSZr_Int : SIi8, XS, EVEX_4V; - let mayLoad = 1 in { - def SSZm : SI, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - let isCodeGenOnly = 1 in - def SSZm_Int : SIi8, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - } - def SDZr : SI, - XD, EVEX_4V, VEX_W; - let isCodeGenOnly = 1 in - def SDZr_Int : SIi8, XD, EVEX_4V, VEX_W; - let mayLoad = 1 in { - def SDZm : SI, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - let isCodeGenOnly = 1 in - def SDZm_Int : SIi8, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - } -} - multiclass avx512_sqrt_packed_all opc, string OpcodeStr, SDNode OpNode> { defm PSZ : avx512_sqrt_packed, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } +multiclass avx512_sqrt_scalar opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar; + + defm rb_Int : AVX512_maskable_scalar, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : SI; + + let mayLoad = 1 in + def m : SI; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>; +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; let Predicates = [HasAVX512] in { - def : Pat<(f32 (fsqrt FR32X:$src)), - (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f64 (fsqrt FR64X:$src)), - (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f32 (X86frsqrt FR32X:$src)), (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (X86frsqrt (load addr:$src))), @@ -5611,20 +5590,6 @@ def : Pat<(f32 (X86frcp (load addr:$src))), (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; - - def : Pat<(int_x86_sse_sqrt_ss VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR32)), - VR128X)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR64)), - VR128X)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; } multiclass Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -332,10 +332,11 @@ def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -1261,6 +1261,10 @@ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -148,19 +148,59 @@ } declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone -define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { - ; CHECK: vsqrtss {{.*}}encoding: [0x62 - %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] +declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_ss: +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2) + %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3) + + %res.1 = fadd <4 x float> %res0, %res1 + %res.2 = fadd <4 x float> %res2, %res3 + %res = fadd <4 x float> %res.1, %res.2 ret <4 x float> %res } -declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone -define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) { - ; CHECK: vsqrtsd {{.*}}encoding: [0x62 - %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1] +declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_sd: +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) + %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2) + %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3) + + %res.1 = fadd <2 x double> %res0, %res1 + %res.2 = fadd <2 x double> %res2, %res3 + %res = fadd <2 x double> %res.1, %res.2 ret <2 x double> %res } -declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) { ; CHECK: vcvtsd2si {{.*}}encoding: [0x62 @@ -183,7 +223,6 @@ } declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone - define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) { ; CHECK: vcvtss2si {{.*}}encoding: [0x62 %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; [#uses=1] Index: llvm/trunk/test/MC/X86/avx512-encodings.s =================================================================== --- llvm/trunk/test/MC/X86/avx512-encodings.s +++ llvm/trunk/test/MC/X86/avx512-encodings.s @@ -14958,6 +14958,110 @@ // CHECK: encoding: [0x62,0xf2,0xc5,0x08,0x43,0x92,0xf8,0xfb,0xff,0xff] vgetexpsd -1032(%rdx), %xmm7, %xmm2 +// CHECK: vsqrtss %xmm8, %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x66,0x00,0x51,0xf0] + vsqrtss %xmm8, %xmm19, %xmm22 + +// CHECK: vsqrtss %xmm8, %xmm19, %xmm22 {%k1} +// CHECK: encoding: [0x62,0xc1,0x66,0x01,0x51,0xf0] + vsqrtss %xmm8, %xmm19, %xmm22 {%k1} + +// CHECK: vsqrtss %xmm8, %xmm19, %xmm22 {%k1} {z} +// CHECK: encoding: [0x62,0xc1,0x66,0x81,0x51,0xf0] + vsqrtss %xmm8, %xmm19, %xmm22 {%k1} {z} + +// CHECK: vsqrtss {rn-sae}, %xmm8, %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x66,0x10,0x51,0xf0] + vsqrtss {rn-sae}, %xmm8, %xmm19, %xmm22 + +// CHECK: vsqrtss {ru-sae}, %xmm8, %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x66,0x50,0x51,0xf0] + vsqrtss {ru-sae}, %xmm8, %xmm19, %xmm22 + +// CHECK: vsqrtss {rd-sae}, %xmm8, %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x66,0x30,0x51,0xf0] + vsqrtss {rd-sae}, %xmm8, %xmm19, %xmm22 + +// CHECK: vsqrtss {rz-sae}, %xmm8, %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xc1,0x66,0x70,0x51,0xf0] + vsqrtss {rz-sae}, %xmm8, %xmm19, %xmm22 + +// CHECK: vsqrtss (%rcx), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x31] + vsqrtss (%rcx), %xmm19, %xmm22 + +// CHECK: vsqrtss 291(%rax,%r14,8), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x66,0x00,0x51,0xb4,0xf0,0x23,0x01,0x00,0x00] + vsqrtss 291(%rax,%r14,8), %xmm19, %xmm22 + +// CHECK: vsqrtss 508(%rdx), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x72,0x7f] + vsqrtss 508(%rdx), %xmm19, %xmm22 + +// CHECK: vsqrtss 512(%rdx), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0xb2,0x00,0x02,0x00,0x00] + vsqrtss 512(%rdx), %xmm19, %xmm22 + +// CHECK: vsqrtss -512(%rdx), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x72,0x80] + vsqrtss -512(%rdx), %xmm19, %xmm22 + +// CHECK: vsqrtss -516(%rdx), %xmm19, %xmm22 +// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0xb2,0xfc,0xfd,0xff,0xff] + vsqrtss -516(%rdx), %xmm19, %xmm22 + +// CHECK: vsqrtsd %xmm12, %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x41,0xef,0x08,0x51,0xd4] + vsqrtsd %xmm12, %xmm2, %xmm26 + +// CHECK: vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xd1,0xef,0x0f,0x51,0xf4] + vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} + +// CHECK: vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xd1,0xef,0x8f,0x51,0xf4] + vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} {z} + +// CHECK: vsqrtsd {rn-sae}, %xmm12, %xmm2, %xmm6 +// CHECK: encoding: [0x62,0xd1,0xef,0x18,0x51,0xf4] + vsqrtsd {rn-sae}, %xmm12, %xmm2, %xmm6 + +// CHECK: vsqrtsd {ru-sae}, %xmm12, %xmm2, %xmm6 +// CHECK: encoding: [0x62,0xd1,0xef,0x58,0x51,0xf4] + vsqrtsd {ru-sae}, %xmm12, %xmm2, %xmm6 + +// CHECK: vsqrtsd {rd-sae}, %xmm12, %xmm2, %xmm6 +// CHECK: encoding: [0x62,0xd1,0xef,0x38,0x51,0xf4] + vsqrtsd {rd-sae}, %xmm12, %xmm2, %xmm6 + +// CHECK: vsqrtsd {rz-sae}, %xmm12, %xmm2, %xmm6 +// CHECK: encoding: [0x62,0xd1,0xef,0x78,0x51,0xf4] + vsqrtsd {rz-sae}, %xmm12, %xmm2, %xmm6 + +// CHECK: vsqrtsd (%rcx), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x11] + vsqrtsd (%rcx), %xmm2, %xmm26 + +// CHECK: vsqrtsd 291(%rax,%r14,8), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x21,0xef,0x08,0x51,0x94,0xf0,0x23,0x01,0x00,0x00] + vsqrtsd 291(%rax,%r14,8), %xmm2, %xmm26 + +// CHECK: vsqrtsd 1016(%rdx), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x52,0x7f] + vsqrtsd 1016(%rdx), %xmm2, %xmm26 + +// CHECK: vsqrtsd 1024(%rdx), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x92,0x00,0x04,0x00,0x00] + vsqrtsd 1024(%rdx), %xmm2, %xmm26 + +// CHECK: vsqrtsd -1024(%rdx), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x52,0x80] + vsqrtsd -1024(%rdx), %xmm2, %xmm26 + +// CHECK: vsqrtsd -1032(%rdx), %xmm2, %xmm26 +// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x92,0xf8,0xfb,0xff,0xff] + vsqrtsd -1032(%rdx), %xmm2, %xmm26 + // CHECK: vinsertf32x4 $171, %xmm3, %zmm26, %zmm11 // CHECK: encoding: [0x62,0x73,0x2d,0x40,0x18,0xdb,0xab] vinsertf32x4 $0xab, %xmm3, %zmm26, %zmm11