Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -10132,7 +10132,8 @@
     else if (Mask[i] - i != AlignVal)
       return SDValue();
   }
-  return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+  // Vector source operands should be swapped
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
                      DAG.getConstant(AlignVal, DL, MVT::i8));
 }
 
@@ -15167,6 +15168,30 @@
                                               Src1,Src2),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case INTR_TYPE_3OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(6);
+        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
     case FMA_OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
@@ -15309,16 +15334,6 @@
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(1));
 
-  case Intrinsic::x86_avx512_mask_valign_q_512:
-  case Intrinsic::x86_avx512_mask_valign_d_512:
-    // Vector source operands are swapped.
-    return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
-                                            Op.getValueType(), Op.getOperand(2),
-                                            Op.getOperand(1),
-                                            Op.getOperand(3)),
-                                Op.getOperand(5), Op.getOperand(4),
-                                Subtarget, DAG);
-
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
   // or testp pattern and a setcc for the result.
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -5611,30 +5611,6 @@
                     (loadv8i64 addr:$src2), (i8 imm:$imm))),
           (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
-multiclass avx512_valign<X86VectorVTInfo _> {
-  defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
-                     "valign"##_.Suffix,
-                     "$src3, $src2, $src1", "$src1, $src2, $src3",
-                     (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
-                                      (i8 imm:$src3)))>,
-             AVX512AIi8Base, EVEX_4V;
-
-  // Also match valign of packed floats.
-  def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
-            (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
-  let mayLoad = 1 in
-  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
-                       (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
-                       !strconcat("valign"##_.Suffix,
-                       "\t{$src3, $src2, $src1, $dst|"
-                           "$dst, $src1, $src2, $src3}"),
-                       []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i32  : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
 def v8i1sextv8i64  : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
@@ -6121,7 +6097,7 @@
                   AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
   let Predicates = [prd] in {
     defm Z    : avx512_fp_packed_imm,
-                avx512_fp_sae_packed_imm,
+                avx512_fp_sae_packed_imm,
                       EVEX_V512;
 
   }
@@ -6133,6 +6109,17 @@
   }
 }
 
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                bits<8> opc, SDNode OpNode>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+    defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+
 multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, X86VectorVTInfo _,
                   bits<8> opc, SDNode OpNode, Predicate prd>{
   let Predicates = [prd] in {
@@ -6189,3 +6176,18 @@
                 AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
 defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
                 AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+                                            AVX512VLVectorVTInfo VTInfo_FP>{
+  defm NAME:       avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  let isCodeGenOnly = 1 in {
+    defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+                EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+                EVEX_CD8<64, CD8VF>, VEX_W;
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -1696,8 +1696,8 @@
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
     { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
     { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
-    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
-    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+    { X86::VALIGNQZrri,       X86::VALIGNQZrmi,         0 },
+    { X86::VALIGNDZrri,       X86::VALIGNDZrmi,         0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
     { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,7 +21,8 @@
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
   CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
-  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+  INTR_TYPE_3OP_MASK, FMA_OP_MASK,
   INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   EXPAND_FROM_MEM, BLEND
 };
@@ -603,6 +604,8 @@
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
Index: llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
 
 ; CHECK-LABEL: test1:
 ; CHECK: vpermps
@@ -250,3 +251,86 @@
   %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
   ret <8 x double> %c
 }
+
+define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rr:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignd $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignd $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %a = load <16 x i32>, <16 x i32>* %a.ptr
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+;
+; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
+; CHECK-SKX:       ## BB#0:
+; CHECK-SKX-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-SKX-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-SKX-NEXT:    valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-SKX-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-SKX-NEXT:    retq
+  %a = load <16 x i32>, <16 x i32>* %a.ptr
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
+  ret <16 x i32> %res
+}
+
+define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v8f64_rr:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignq $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %a = load <8 x double>, <8 x double>* %a.ptr
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm1, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+;
+; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
+; CHECK-SKX:       ## BB#0:
+; CHECK-SKX-NEXT:    vpmovw2m %xmm1, %k1
+; CHECK-SKX-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-SKX-NEXT:    retq
+  %a = load <8 x double>, <8 x double>* %a.ptr
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
Index: llvm/trunk/test/MC/X86/avx512-encodings.s
===================================================================
--- llvm/trunk/test/MC/X86/avx512-encodings.s
+++ llvm/trunk/test/MC/X86/avx512-encodings.s
@@ -6084,6 +6084,66 @@
 // CHECK: encoding: [0x62,0xf3,0xfd,0x49,0x03,0xcb,0x03]
           valignq $3, %zmm3, %zmm0, %zmm1 {%k1}
 
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0xab]
+          valignq $0xab, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3}
+// CHECK: encoding: [0x62,0x23,0xdd,0x4b,0x03,0xe7,0xab]
+          valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3}
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x23,0xdd,0xcb,0x03,0xe7,0xab]
+          valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3} {z}
+
+// CHECK: valignq $123, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0x7b]
+          valignq $0x7b, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x21,0x7b]
+          valignq $0x7b, (%rcx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          valignq $0x7b, 291(%rax,%r14,8), %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x21,0x7b]
+          valignq $0x7b, (%rcx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 8128(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x7f,0x7b]
+          valignq $0x7b, 8128(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0x00,0x20,0x00,0x00,0x7b]
+          valignq $0x7b, 8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x80,0x7b]
+          valignq $0x7b, -8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8256(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+          valignq $0x7b, -8256(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 1016(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x7f,0x7b]
+          valignq $0x7b, 1016(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0x00,0x04,0x00,0x00,0x7b]
+          valignq $0x7b, 1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x80,0x7b]
+          valignq $0x7b, -1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1032(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0xf8,0xfb,0xff,0xff,0x7b]
+          valignq $0x7b, -1032(%rdx){1to8}, %zmm4, %zmm28
+
 // CHECK: vextractf32x4 $3
 // CHECK: encoding: [0x62,0xf3,0x7d,0x49,0x19,0xd9,0x03]
           vextractf32x4 $3, %zmm3, %xmm1 {%k1}
Index: llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s
===================================================================
--- llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s
+++ llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s
@@ -11013,3 +11013,122 @@
 // CHECK: encoding: [0x62,0x63,0xad,0x30,0x43,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
           vshufi64x2 $0x7b, -1032(%rdx){1to4}, %ymm26, %ymm25
 
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0xab]
+          valignq $0xab, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5}
+// CHECK: encoding: [0x62,0x83,0xed,0x05,0x03,0xd8,0xab]
+          valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5}
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5} {z}
+// CHECK: encoding: [0x62,0x83,0xed,0x85,0x03,0xd8,0xab]
+          valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5} {z}
+
+// CHECK: valignq $123, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0x7b]
+          valignq $0x7b, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x19,0x7b]
+          valignq $0x7b, (%rcx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xa3,0xed,0x00,0x03,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          valignq $0x7b, 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x19,0x7b]
+          valignq $0x7b, (%rcx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 2032(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x7f,0x7b]
+          valignq $0x7b, 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0x00,0x08,0x00,0x00,0x7b]
+          valignq $0x7b, 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x80,0x7b]
+          valignq $0x7b, -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2064(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+          valignq $0x7b, -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 1016(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x7f,0x7b]
+          valignq $0x7b, 1016(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0x00,0x04,0x00,0x00,0x7b]
+          valignq $0x7b, 1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x80,0x7b]
+          valignq $0x7b, -1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1032(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+          valignq $0x7b, -1032(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0xab]
+          valignq $0xab, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2}
+// CHECK: encoding: [0x62,0x03,0xbd,0x22,0x03,0xca,0xab]
+          valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2}
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2} {z}
+// CHECK: encoding: [0x62,0x03,0xbd,0xa2,0x03,0xca,0xab]
+          valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2} {z}
+
+// CHECK: valignq $123, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0x7b]
+          valignq $0x7b, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x09,0x7b]
+          valignq $0x7b, (%rcx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x23,0xbd,0x20,0x03,0x8c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          valignq $0x7b, 291(%rax,%r14,8), %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x09,0x7b]
+          valignq $0x7b, (%rcx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 4064(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x7f,0x7b]
+          valignq $0x7b, 4064(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0x00,0x10,0x00,0x00,0x7b]
+          valignq $0x7b, 4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x80,0x7b]
+          valignq $0x7b, -4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4128(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0xe0,0xef,0xff,0xff,0x7b]
+          valignq $0x7b, -4128(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 1016(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x7f,0x7b]
+          valignq $0x7b, 1016(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0x00,0x04,0x00,0x00,0x7b]
+          valignq $0x7b, 1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x80,0x7b]
+          valignq $0x7b, -1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1032(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
+          valignq $0x7b, -1032(%rdx){1to4}, %ymm24, %ymm25