Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -1352,14 +1352,14 @@ defm rr: AVX512_maskable_3src, EVEX_4V, + (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V, AVX5128IBase; defm rm: AVX512_maskable_3src, + (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>, EVEX_4V, AVX5128IBase; } } @@ -1371,8 +1371,8 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermi2X _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, - AVX5128IBase, EVEX_4V, EVEX_B; + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), + 1>, AVX5128IBase, EVEX_4V, EVEX_B; } multiclass avx512_perm_i_sizes opc, string OpcodeStr, @@ -1420,14 +1420,14 @@ defm rr: AVX512_maskable_3src, EVEX_4V, - AVX5128IBase; + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, + EVEX_4V, AVX5128IBase; defm rm: AVX512_maskable_3src, + (bitconvert (_.LdFrag addr:$src3)))), 1>, EVEX_4V, AVX5128IBase; } } @@ -1439,8 +1439,8 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, - AVX5128IBase, EVEX_4V, EVEX_B; + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), + 1>, AVX5128IBase, EVEX_4V, EVEX_B; } multiclass avx512_perm_t_sizes opc, string OpcodeStr, Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -3533,6 +3533,92 @@ return true; } +// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be +// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) { +#define VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \ + case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \ + case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \ + case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \ + case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \ + case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \ + case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \ + case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \ + case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \ + case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \ + case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \ + case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz: + +#define VPERM_CASES_BROADCAST(Suffix) \ + VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \ + case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \ + case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \ + case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \ + case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \ + case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz: + + switch (Opcode) { + default: return false; + VPERM_CASES(B) + VPERM_CASES_BROADCAST(D) + VPERM_CASES_BROADCAST(PD) + VPERM_CASES_BROADCAST(PS) + VPERM_CASES_BROADCAST(Q) + VPERM_CASES(W) + return true; + } +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES +} + +// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching +// from the I opcode to the T opcode and vice versa.
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { +#define VPERM_CASES(Orig, New) \ + case X86::Orig##128rr: return X86::New##128rr; \ + case X86::Orig##128rrkz: return X86::New##128rrkz; \ + case X86::Orig##128rm: return X86::New##128rm; \ + case X86::Orig##128rmkz: return X86::New##128rmkz; \ + case X86::Orig##256rr: return X86::New##256rr; \ + case X86::Orig##256rrkz: return X86::New##256rrkz; \ + case X86::Orig##256rm: return X86::New##256rm; \ + case X86::Orig##256rmkz: return X86::New##256rmkz; \ + case X86::Orig##rr: return X86::New##rr; \ + case X86::Orig##rrkz: return X86::New##rrkz; \ + case X86::Orig##rm: return X86::New##rm; \ + case X86::Orig##rmkz: return X86::New##rmkz; + +#define VPERM_CASES_BROADCAST(Orig, New) \ + VPERM_CASES(Orig, New) \ + case X86::Orig##128rmb: return X86::New##128rmb; \ + case X86::Orig##128rmbkz: return X86::New##128rmbkz; \ + case X86::Orig##256rmb: return X86::New##256rmb; \ + case X86::Orig##256rmbkz: return X86::New##256rmbkz; \ + case X86::Orig##rmb: return X86::New##rmb; \ + case X86::Orig##rmbkz: return X86::New##rmbkz; + + switch (Opcode) { + VPERM_CASES(VPERMI2B, VPERMT2B) + VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) + VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) + VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) + VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) + VPERM_CASES(VPERMI2W, VPERMT2W) + VPERM_CASES(VPERMT2B, VPERMI2B) + VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) + VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) + VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) + VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) + VPERM_CASES(VPERMT2W, VPERMI2W) + } + + llvm_unreachable("Unreachable!"); +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES +} + MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -3854,7 +3940,15 @@ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - default: + default: { + if (isCommutableVPERMV3Instruction(MI.getOpcode())) { + unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + const X86InstrFMA3Group *FMA3Group = X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); if (FMA3Group) { @@ -3870,6 +3964,7 @@ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } + } } bool X86InstrInfo::findFMA3CommutedOpIndices( @@ -4041,12 +4136,26 @@ // Handled masked instructions since we need to skip over the mask input // and the preserved input. if (Desc.TSFlags & X86II::EVEX_K) { + // First assume that the first input is the mask operand and skip past it. unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1; - // If there is no preserved input we only need to skip 1 operand. - if (MI.getDesc().getOperandConstraint(Desc.getNumDefs(), - MCOI::TIED_TO) != -1) - ++CommutableOpIdx1; - unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1; + unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2; + // Check if the first input is tied. If there isn't one then we only + // need to skip the mask operand which we did above. + if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(), + MCOI::TIED_TO) != -1)) { + // If this is zero masking instruction with a tied operand, we need to + // move the first index back to the first input since this must + // be a 3 input instruction and we want the first two non-mask inputs. 
+ // Otherwise this is a 2 input instruction with a preserved input and + // mask, so we need to move the indices to skip one more input. + if (Desc.TSFlags & X86II::EVEX_Z) + --CommutableOpIdx1; + else { + ++CommutableOpIdx1; + ++CommutableOpIdx2; + } + } + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, CommutableOpIdx2)) return false; Index: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll +++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll @@ -369,8 +369,8 @@ ; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} ; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; KNL-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm0 +; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpslld $31, %zmm2, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -384,8 +384,8 @@ ; SKX-NEXT: vpmovm2d %k1, %zmm0 ; SKX-NEXT: vpmovm2d %k0, %zmm1 ; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; SKX-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; SKX-NEXT: vpmovd2m %zmm0, %k0 +; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpmovd2m %zmm2, %k0 ; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 @@ -406,8 +406,8 @@ ; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} ; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -421,8 +421,8 @@ ; SKX-NEXT: vpmovm2q %k1, %zmm0 ; SKX-NEXT: vpmovm2q %k0, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; SKX-NEXT: vpmovq2m %zmm0, %k0 +; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 @@ -1217,8 +1217,8 @@ ; SKX-NEXT: vpmovm2w %k1, %zmm0 ; SKX-NEXT: vpmovm2w %k0, %zmm1 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] -; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; SKX-NEXT: vpmovw2m %zmm0, %k0 +; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpmovw2m %zmm2, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: retq %cmp_res_i1 = icmp ult i32 %a, %b @@ -1249,14 +1249,14 @@ ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; KNL-NEXT: vpsllq $63, %zmm3, %zmm2 +; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2 ; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} ; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7] -; KNL-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 -; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 +; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} ; KNL-NEXT: vpextrd $3, %xmm0, %eax @@ -1264,8 +1264,8 @@ ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [0,1,2,8,4,5,6,7] -; KNL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 +; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -1310,8 +1310,8 @@ ; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} ; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll @@ -643,8 +643,8 @@ ; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; KNL-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 @@ -665,8 +665,8 @@ ; SKX-NEXT: vpmovm2q %k0, %zmm0 ; SKX-NEXT: vpmovm2q %k1, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] -; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; SKX-NEXT: vpmovq2m %zmm0, %k0 +; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: kshiftrb $1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 Index: llvm/trunk/test/CodeGen/X86/avx512-vpermv3-commute.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-vpermv3-commute.ll +++ llvm/trunk/test/CodeGen/X86/avx512-vpermv3-commute.ll @@ -8,8 +8,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermt2d (%rdi), %zmm1, %zmm0 ; CHECK-NEXT: retq %x2 = load <16 x i32>, <16 x i32>* %x2p %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) @@ -21,8 +20,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) ret <8 x double> %res @@ -33,8 +31,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) ret <16 x float> %res @@ -45,8 +42,7 @@ define <8 x 
i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) ret <8 x i64> %res @@ -58,8 +54,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x2 = load <16 x i32>, <16 x i32>* %x2p %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) @@ -72,8 +67,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 -; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %x2s = load double, double* %x2ptr %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 @@ -88,8 +82,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) ret <16 x float> %res @@ -102,8 +95,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) ret <8 x i64> %res @@ -114,8 +106,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) ret <16 x i32> %res @@ -126,8 +117,7 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) ret <4 x i32> %res @@ -139,8 +129,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) ret <4 x i32> 
%res @@ -150,8 +139,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 -; CHECK-NEXT: vpermt2d (%rdi){1to4}, %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2d (%rdi){1to4}, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %x2s = load i32, i32* %x2ptr %x2ins = insertelement <4 x i32> undef, i32 %x2s, i32 0 @@ -165,8 +153,7 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) ret <8 x i32> %res @@ -178,8 +165,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) ret <8 x i32> %res @@ -190,8 +176,7 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) ret <2 x double> %res @@ -202,8 +187,7 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) ret <4 x double> %res @@ -214,8 +198,7 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) ret <4 x float> %res @@ -226,8 +209,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) ret <8 x float> %res @@ -236,8 +218,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %x0, <8 x i32> %x1, <8 x float>* %x2p) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_load: ; CHECK: ## BB#0: -; CHECK-NEXT: 
vpermi2ps (%rdi), %ymm0, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %x2 = load <8 x float>, <8 x float>* %x2p %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) @@ -247,8 +228,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast(<8 x float> %x0, <8 x i32> %x1, float* %x2ptr) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2ps (%rdi){1to8}, %ymm0, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vpermt2ps (%rdi){1to8}, %ymm1, %ymm0 ; CHECK-NEXT: retq %x2s = load float, float* %x2ptr %x2ins = insertelement <8 x float> undef, float %x2s, i32 0 @@ -262,8 +242,7 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) ret <16 x i8> %res @@ -274,8 +253,7 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) ret <32 x i8> %res @@ -286,8 +264,7 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) ret <16 x i8> %res @@ -296,8 +273,7 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128_load: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %x2 = load <16 x i8>, <16 x i8>* %x2p %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) @@ -309,8 +285,7 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) ret <32 x i8> %res @@ -322,8 +297,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <16 x i8> 
@llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) ret <16 x i8> %res @@ -333,8 +307,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 +; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %x2 = load <16 x i8>, <16 x i8>* %x2p %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) @@ -347,8 +320,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) ret <32 x i8> %res @@ -358,8 +330,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpermt2b (%rdi), %ymm0, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 +; CHECK-NEXT: vpermi2b (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %x2 = load <32 x i8>, <32 x i8>* %x2p %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) Index: llvm/trunk/test/CodeGen/X86/avx512vbmi-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vbmi-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vbmi-intrinsics.ll @@ -90,8 +90,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) ret <64 x i8> %res Index: llvm/trunk/test/CodeGen/X86/avx512vbmivl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vbmivl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vbmivl-intrinsics.ll @@ -175,8 +175,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xca] -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xc1] +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) ret <16 x i8> %res @@ -188,8 +187,7 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xca] -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xc1] +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x75,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 
x i8> %x1, <32 x i8> %x2, i32 %x3) ret <32 x i8> %res Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -138,19 +138,19 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_f64_1u3u5zu8: ; ALL: # BB#0: -; ALL-NEXT: vmovupd 8(%rdi), %zmm0 -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: vmovapd {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7> -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vmovupd 8(%rdi), %zmm1 +; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; ALL-NEXT: vmovapd {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7> +; ALL-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0 ; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 -; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; X32-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0> -; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm1 +; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; X32-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0> +; X32-AVX512F-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 1 %ptr2 = getelementptr inbounds double, double* %ptr, i64 3 @@ -225,19 +225,19 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_i64_1u3u5zu8: ; ALL: # BB#0: -; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0 -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7> -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vmovdqu64 8(%rdi), %zmm1 +; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; ALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7> +; ALL-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 ; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 -; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0> -; X32-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm1 +; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0> +; X32-AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1 %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3 @@ -334,19 +334,19 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF: ; ALL: # BB#0: -; ALL-NEXT: vmovups (%rdi), %zmm0 -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vmovups (%rdi), %zmm1 +; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovups (%eax), %zmm0 -; 
X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; X32-AVX512F-NEXT: vmovups (%eax), %zmm1 +; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 0 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3 @@ -448,19 +448,19 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: ; ALL: # BB#0: -; ALL-NEXT: vmovdqu32 (%rdi), %zmm0 -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vmovdqu32 (%rdi), %zmm1 +; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; ALL-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; ALL-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 -; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; X32-AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm1 +; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; X32-AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3 Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -877,8 +877,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -910,8 +910,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -941,8 +941,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> 
%b, <16 x i32> ret <16 x i16> %shuffle @@ -3279,8 +3279,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3313,8 +3313,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3476,8 +3476,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3504,8 +3504,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u> -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3628,8 +3628,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3774,8 +3774,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3938,8 +3938,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3986,8 +3986,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; 
AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u> -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -4167,8 +4167,8 @@ ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX512VL-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> %2 = bitcast <16 x i16> %1 to <4 x i64> @@ -4257,8 +4257,8 @@ ; AVX512VL-LABEL: PR24935: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] -; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -312,10 +312,9 @@ ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,10,2,3,3] -; AVX512VL-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3] +; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -675,8 +674,8 @@ ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] -; AVX512VL-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1316,9 +1315,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i32_08192a3b: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [0,8,2,9,4,10,6,11] -; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [0,8,2,9,4,10,6,11] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1345,10 +1344,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [8,0,1,1,10,2,3,3] -; AVX512VL-NEXT: vpermt2d %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: 
vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3] +; AVX512VL-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1992,8 +1990,8 @@ ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13] -; AVX512VL-NEXT: vpermt2d %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 +; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -262,15 +262,15 @@ ; AVX512F-LABEL: shuffle_v8f64_8823cc67: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_8823cc67: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -281,15 +281,15 @@ ; AVX512F-LABEL: shuffle_v8f64_9832dc76: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_9832dc76: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -300,15 +300,15 @@ ; AVX512F-LABEL: shuffle_v8f64_9810dc54: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_9810dc54: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -370,15 +370,15 @@ ; AVX512F-LABEL: shuffle_v8f64_08991abb: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, 
%zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_08991abb: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -406,15 +406,15 @@ ; AVX512F-LABEL: shuffle_v8f64_09ab1def: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_09ab1def: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -927,15 +927,15 @@ ; AVX512F-LABEL: shuffle_v8f64_c348cda0: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8] -; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_c348cda0: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0] -; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -1180,15 +1180,15 @@ ; AVX512F-LABEL: shuffle_v8i64_81a3c5e7: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1233,15 +1233,15 @@ ; AVX512F-LABEL: shuffle_v8i64_8823cc67: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_8823cc67: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: 
retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1252,15 +1252,15 @@ ; AVX512F-LABEL: shuffle_v8i64_9832dc76: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_9832dc76: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1271,15 +1271,15 @@ ; AVX512F-LABEL: shuffle_v8i64_9810dc54: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_9810dc54: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1341,15 +1341,15 @@ ; AVX512F-LABEL: shuffle_v8i64_08991abb: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_08991abb: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1377,15 +1377,15 @@ ; AVX512F-LABEL: shuffle_v8i64_09ab1def: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_09ab1def: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1914,15 +1914,15 @@ ; AVX512F-LABEL: shuffle_v8i64_6caa87e5: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq 
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
-; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
 ret <8 x i64> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -123,18 +123,18 @@
 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovd %eax, %k1
 ; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X32-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X32-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X64-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
-; X64-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; X64-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 %m)
 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 %m)
@@ -190,9 +190,9 @@
 ; X32-LABEL: combine_vpermt2var_8i64_identity:
 ; X32: # BB#0:
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
-; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
-; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0
+; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
+; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity:
@@ -208,18 +208,18 @@
 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovd %eax, %k1
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
-; X64-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; X64-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 %m)
 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@@ -243,18 +243,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X32-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X64-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
 %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %res0, <16 x float> %res0, i16 %m)
@@ -280,17 +280,17 @@
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X32: # BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %zmm1
-; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; X32-NEXT: vmovaps (%eax), %zmm2
+; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
 ; X32-NEXT: vmovaps %zmm1, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X64: # BB#0:
-; X64-NEXT: vmovaps (%rdi), %zmm1
-; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; X64-NEXT: vmovaps (%rdi), %zmm2
+; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
 ; X64-NEXT: vmovaps %zmm1, %zmm0
 ; X64-NEXT: retq
 %x0 = load <16 x float>, <16 x float> *%p0
@@ -319,18 +319,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %zmm1
-; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
+; X32-NEXT: vmovaps (%eax), %zmm2
+; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
 ; X32-NEXT: vmovaps %zmm1, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %esi, %k1
-; X64-NEXT: vmovaps (%rdi), %zmm1
-; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
+; X64-NEXT: vmovaps (%rdi), %zmm2
+; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
 ; X64-NEXT: vmovaps %zmm1, %zmm0
 ; X64-NEXT: retq
 %x0 = load <16 x float>, <16 x float> *%p0
@@ -521,18 +521,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X32-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT: vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X64-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT: vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 %m)
 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 %m)
@@ -556,18 +556,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X32-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X32-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovd %edi, %k1
 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
-; X64-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 %m)
 %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %res0, <32 x i16> %res0, i32 %m)
@@ -938,9 +938,9 @@
 ; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
 ; X32: # BB#0:
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
-; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0
+; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
+; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
@@ -1008,15 +1008,15 @@
 ; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
 ; X32: # BB#0:
 ; X32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
-; X32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; X32-NEXT: vmovapd %zmm1, %zmm0
+; X32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
+; X32-NEXT: vmovapd %zmm2, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
 ; X64: # BB#0:
 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
-; X64-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; X64-NEXT: vmovapd %zmm1, %zmm0
+; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
+; X64-NEXT: vmovapd %zmm2, %zmm0
 ; X64-NEXT: retq
 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1)
 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 -1)
@@ -1044,15 +1044,15 @@
 ; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
 ; X32: # BB#0:
 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
-; X32-NEXT: vpermt2w %zmm0, %zmm2, %zmm1
-; X32-NEXT: vmovdqa64 %zmm1, %zmm0
+; X32-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; X32-NEXT: vmovdqa64 %zmm2, %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
 ; X64: # BB#0:
 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
-; X64-NEXT: vpermt2w %zmm0, %zmm2, %zmm1
-; X64-NEXT: vmovdqa64 %zmm1, %zmm0
+; X64-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; X64-NEXT: vmovdqa64 %zmm2, %zmm0
 ; X64-NEXT: retq
 %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 -1)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1)
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -23,18 +23,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqu16 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 {%k1} {z}
-; X32-NEXT: vmovdqu16 {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
+; X32-NEXT: vmovdqu16 {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovdqu16 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 {%k1} {z}
-; X64-NEXT: vmovdqu16 {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
+; X64-NEXT: vmovdqu16 {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %x0, <16 x i16> %x1, i16 %m)
 %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %res0, <16 x i16> %res0, i16 %m)
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -38,18 +38,18 @@
 ; X32: # BB#0:
 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqu8 {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 {%k1} {z}
-; X32-NEXT: vmovdqu8 {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
+; X32-NEXT: vmovdqu8 {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64: # BB#0:
 ; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovdqu8 {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 {%k1} {z}
-; X64-NEXT: vmovdqu8 {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
+; X64-NEXT: vmovdqu8 {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %x0, <16 x i8> %x1, i16 %m)
 %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %res0, <16 x i8> %res0, i16 %m)
@@ -109,8 +109,7 @@
 ; X32-NEXT: vmovdqu8 {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
 ; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
 ; X32-NEXT: vmovdqu8 {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
-; X32-NEXT: vpermt2b %xmm2, %xmm0, %xmm2
-; X32-NEXT: vmovdqa64 %xmm2, %xmm0
+; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
@@ -118,8 +117,7 @@
 ; X64-NEXT: vmovdqu8 {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
 ; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
 ; X64-NEXT: vmovdqu8 {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
-; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm2
-; X64-NEXT: vmovdqa64 %xmm2, %xmm0
+; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
 ; X64-NEXT: retq
 %res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x1, i16 -1)
 %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %res0, <16 x i8> %res0, i16 -1)
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -105,8 +105,8 @@
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; AVX512F-NEXT: vpslld $31, %zmm3, %zmm1
 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -119,8 +119,8 @@
 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
 ; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
-; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0
+; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
 ; VL_BW_DQ-NEXT: retq
 %a2 = icmp eq <16 x i32> %a, %a1
@@ -189,8 +189,8 @@
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: retq
@@ -201,8 +201,8 @@
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
 ; VL_BW_DQ-NEXT: retq
 %b = bitcast i8 %a to <8 x i1>
@@ -245,8 +245,8 @@
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: retq
@@ -257,8 +257,8 @@
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
 ; VL_BW_DQ-NEXT: retq
 %b = bitcast i8 %a to <8 x i1>
@@ -307,8 +307,8 @@
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
+; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: retq
@@ -321,8 +321,8 @@
 ; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
 ; VL_BW_DQ-NEXT: retq
 %b = bitcast i8 %a to <8 x i1>
@@ -340,8 +340,8 @@
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: retq