Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -1327,14 +1327,14 @@
   defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
+          (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
           AVX5128IBase;

   defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
             (ins _.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
-                   (_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
+                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
             EVEX_4V, AVX5128IBase;
   }
 }
@@ -1346,8 +1346,8 @@
                 OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
                 !strconcat("$src2, ${src3}", _.BroadcastStr ),
                 (_.VT (X86VPermi2X _.RC:$src1,
-                 _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
-                 AVX5128IBase, EVEX_4V, EVEX_B;
+                 _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+                 1>, AVX5128IBase, EVEX_4V, EVEX_B;
 }

 multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
@@ -1395,14 +1395,14 @@
   defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins IdxVT.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
-          AVX5128IBase;
+          (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+          EVEX_4V, AVX5128IBase;

   defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
             (ins IdxVT.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
-                   (bitconvert (_.LdFrag addr:$src3))))>,
+                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
             EVEX_4V, AVX5128IBase;
   }
 }
@@ -1414,8 +1414,8 @@
                 OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
                 !strconcat("$src2, ${src3}", _.BroadcastStr ),
                 (_.VT (X86VPermt2 _.RC:$src1,
-                 IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
-                 AVX5128IBase, EVEX_4V, EVEX_B;
+                 IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+                 1>, AVX5128IBase, EVEX_4V, EVEX_B;
 }

 multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
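Editor's note on why this commute is sound before the X86InstrInfo.cpp changes below: VPERMI2 and VPERMT2 perform the same two-source element select and differ only in which source is tied to (and clobbered by) the destination -- VPERMI2 overwrites the index vector, VPERMT2 overwrites the first data table. The trailing `1` threaded through the patterns above appears to set the IsCommutable bit that AVX512_maskable_3src accepts. A minimal standalone C++ sketch of the semantics; the helpers perm2/vpermi2q/vpermt2q are illustrative models, not LLVM APIs:

#include <array>
#include <cassert>
#include <cstdint>

// Two-source permute over 8 x i64 lanes: each index uses 4 bits; the low
// 3 bits pick a lane and bit 3 picks which table (A or B) supplies it.
using V8 = std::array<uint64_t, 8>;

static V8 perm2(const V8 &Idx, const V8 &A, const V8 &B) {
  V8 R{};
  for (int i = 0; i < 8; ++i) {
    unsigned S = static_cast<unsigned>(Idx[i]) & 15;
    R[i] = (S & 8) ? B[S & 7] : A[S & 7];
  }
  return R;
}

// vpermi2q: the destination register starts out holding the indices.
static V8 vpermi2q(const V8 &IdxDst, const V8 &A, const V8 &B) {
  return perm2(IdxDst, A, B);
}

// vpermt2q: the destination register starts out holding the first data table.
static V8 vpermt2q(const V8 &ADst, const V8 &Idx, const V8 &B) {
  return perm2(Idx, ADst, B);
}

int main() {
  V8 A{0, 1, 2, 3, 4, 5, 6, 7}, B{10, 11, 12, 13, 14, 15, 16, 17};
  V8 Idx{0, 1, 2, 3, 8, 5, 6, 7}; // index vector from the tests below:
                                  // lane 4 reads B[0], the rest come from A
  // Swapping the data operand with the index operand while flipping T<->I
  // yields the same value; only the clobbered register differs.
  assert(vpermt2q(A, Idx, B) == vpermi2q(Idx, A, B));
  return 0;
}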
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3532,6 +3532,92 @@
   return true;
 }

+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
+// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+#define VPERM_CASES(Suffix) \
+  case X86::VPERMI2##Suffix##128rr:   case X86::VPERMT2##Suffix##128rr:   \
+  case X86::VPERMI2##Suffix##256rr:   case X86::VPERMT2##Suffix##256rr:   \
+  case X86::VPERMI2##Suffix##rr:      case X86::VPERMT2##Suffix##rr:      \
+  case X86::VPERMI2##Suffix##128rm:   case X86::VPERMT2##Suffix##128rm:   \
+  case X86::VPERMI2##Suffix##256rm:   case X86::VPERMT2##Suffix##256rm:   \
+  case X86::VPERMI2##Suffix##rm:      case X86::VPERMT2##Suffix##rm:      \
+  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
+  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
+  case X86::VPERMI2##Suffix##rrkz:    case X86::VPERMT2##Suffix##rrkz:    \
+  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
+  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
+  case X86::VPERMI2##Suffix##rmkz:    case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix) \
+  VPERM_CASES(Suffix) \
+  case X86::VPERMI2##Suffix##128rmb:   case X86::VPERMT2##Suffix##128rmb:   \
+  case X86::VPERMI2##Suffix##256rmb:   case X86::VPERMT2##Suffix##256rmb:   \
+  case X86::VPERMI2##Suffix##rmb:      case X86::VPERMT2##Suffix##rmb:      \
+  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+  case X86::VPERMI2##Suffix##rmbkz:    case X86::VPERMT2##Suffix##rmbkz:
+
+  switch (Opcode) {
+  default: return false;
+  VPERM_CASES(B)
+  VPERM_CASES_BROADCAST(D)
+  VPERM_CASES_BROADCAST(PD)
+  VPERM_CASES_BROADCAST(PS)
+  VPERM_CASES_BROADCAST(Q)
+  VPERM_CASES(W)
+  return true;
+  }
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
+// from the I opcode to the T opcode and vice versa.
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+#define VPERM_CASES(Orig, New) \
+  case X86::Orig##128rr:   return X86::New##128rr;   \
+  case X86::Orig##128rrkz: return X86::New##128rrkz; \
+  case X86::Orig##128rm:   return X86::New##128rm;   \
+  case X86::Orig##128rmkz: return X86::New##128rmkz; \
+  case X86::Orig##256rr:   return X86::New##256rr;   \
+  case X86::Orig##256rrkz: return X86::New##256rrkz; \
+  case X86::Orig##256rm:   return X86::New##256rm;   \
+  case X86::Orig##256rmkz: return X86::New##256rmkz; \
+  case X86::Orig##rr:      return X86::New##rr;      \
+  case X86::Orig##rrkz:    return X86::New##rrkz;    \
+  case X86::Orig##rm:      return X86::New##rm;      \
+  case X86::Orig##rmkz:    return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New) \
+  VPERM_CASES(Orig, New) \
+  case X86::Orig##128rmb:   return X86::New##128rmb;   \
+  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+  case X86::Orig##256rmb:   return X86::New##256rmb;   \
+  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+  case X86::Orig##rmb:      return X86::New##rmb;      \
+  case X86::Orig##rmbkz:    return X86::New##rmbkz;
+
+  switch (Opcode) {
+  VPERM_CASES(VPERMI2B, VPERMT2B)
+  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+  VPERM_CASES(VPERMI2W, VPERMT2W)
+  VPERM_CASES(VPERMT2B, VPERMI2B)
+  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+  VPERM_CASES(VPERMT2W, VPERMI2W)
+  }
+
+  llvm_unreachable("Unreachable!");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                    unsigned OpIdx1,
                                                    unsigned OpIdx2) const {
@@ -3853,7 +3939,15 @@
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
-  default:
+  default: {
+    if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+      unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+      auto &WorkingMI = cloneIfNew(MI);
+      WorkingMI.setDesc(get(Opc));
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                     OpIdx1, OpIdx2);
+    }
+
     const X86InstrFMA3Group *FMA3Group =
         X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
     if (FMA3Group) {
@@ -3869,6 +3963,7 @@
     return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1,
                                                    OpIdx2);
   }
+  }
 }

 bool X86InstrInfo::findFMA3CommutedOpIndices(
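The remaining hunks are test updates, and they all show the same rewrite: with the commute available, the two-address pass can turn a `vpermt2*` (which must overwrite a data register) into a `vpermi2*` (which overwrites the index register) whenever that is cheaper -- typically because the indices are a just-materialized constant that is dead afterwards. For example, `vpermt2d %zmm0, %zmm2, %zmm1` becomes `vpermi2d %zmm0, %zmm1, %zmm2` (AT&T operand order), computing the same vector but leaving it in %zmm2. A small self-contained C++ sketch, under the same assumed lane semantics as above and with illustrative names, working through the first index vector in the checks below:

#include <array>
#include <cstdint>
#include <cstdio>

// 16 x i32 two-source lane select: 5 index bits per lane; bit 4 picks the
// table (A or B) and the low 4 bits pick the lane within it.
using V16 = std::array<uint32_t, 16>;

static V16 perm2d(const V16 &Idx, const V16 &A, const V16 &B) {
  V16 R{};
  for (int i = 0; i < 16; ++i) {
    unsigned S = Idx[i] & 31;
    R[i] = (S & 16) ? B[S & 15] : A[S & 15];
  }
  return R;
}

int main() {
  // Index vector from the first avx512-insert-extract.ll hunk below: every
  // lane is copied from A except lane 10, which reads B[16 & 15] = B[0] --
  // i.e. the permute implements "insert one element at position 10".
  V16 Idx{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 11, 12, 13, 14, 15};
  V16 A{}, B{};
  for (int i = 0; i < 16; ++i) { A[i] = 100 + i; B[i] = 200 + i; }

  V16 R = perm2d(Idx, A, B);
  // vpermt2d writes this value over the A register; vpermi2d writes the same
  // value over the Idx register. The commute just picks the cheaper victim.
  std::printf("lane 10 = %u (expected 200)\n", R[10]);
  return R[10] == 200 ? 0 : 1;
}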
Index: test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- test/CodeGen/X86/avx512-insert-extract.ll
+++ test/CodeGen/X86/avx512-insert-extract.ll
@@ -369,8 +369,8 @@
 ; KNL-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
 ; KNL-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; KNL-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm0
+; KNL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; KNL-NEXT:    vpslld $31, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    retq
@@ -384,8 +384,8 @@
 ; SKX-NEXT:    vpmovm2d %k1, %zmm0
 ; SKX-NEXT:    vpmovm2d %k0, %zmm1
 ; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; SKX-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; SKX-NEXT:    vpmovd2m %zmm0, %k0
+; SKX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; SKX-NEXT:    vpmovd2m %zmm2, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
 ; SKX-NEXT:    retq
 %x = load i1 , i1 * %addr, align 128
@@ -406,8 +406,8 @@
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; KNL-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; KNL-NEXT:    vpsllq $63, %zmm1, %zmm0
+; KNL-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    retq
@@ -421,8 +421,8 @@
 ; SKX-NEXT:    vpmovm2q %k1, %zmm0
 ; SKX-NEXT:    vpmovm2q %k0, %zmm1
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; SKX-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; SKX-NEXT:    vpmovq2m %zmm0, %k0
+; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; SKX-NEXT:    vpmovq2m %zmm2, %k0
 ; SKX-NEXT:    kmovb %k0, %eax
 ; SKX-NEXT:    retq
 %x = load i1 , i1 * %addr, align 128
@@ -1217,8 +1217,8 @@
 ; SKX-NEXT:    vpmovm2w %k1, %zmm0
 ; SKX-NEXT:    vpmovm2w %k0, %zmm1
 ; SKX-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; SKX-NEXT:    vpermt2w %zmm1, %zmm2, %zmm0
-; SKX-NEXT:    vpmovw2m %zmm0, %k0
+; SKX-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
+; SKX-NEXT:    vpmovw2m %zmm2, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    retq
 %cmp_res_i1 = icmp ult i32 %a, %b
@@ -1249,14 +1249,14 @@
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
-; KNL-NEXT:    vpsllq $63, %zmm3, %zmm2
+; KNL-NEXT:    vpermi2q %zmm2, %zmm3, %zmm4
+; KNL-NEXT:    vpsllq $63, %zmm4, %zmm2
 ; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k2
 ; KNL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2} {z}
 ; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7]
-; KNL-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm2
+; KNL-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
+; KNL-NEXT:    vpsllq $63, %zmm4, %zmm2
 ; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vpextrd $3, %xmm0, %eax
@@ -1264,8 +1264,8 @@
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7]
-; KNL-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
+; KNL-NEXT:    vpermi2q %zmm0, %zmm2, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    retq
@@ -1310,8 +1310,8 @@
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; KNL-NEXT:    vpsllq $63, %zmm1, %zmm0
+; KNL-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    retq
Index: test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512-mask-op.ll
+++ test/CodeGen/X86/avx512-mask-op.ll
@@ -643,8 +643,8 @@
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
-; KNL-NEXT:    vpermt2q %zmm2, %zmm3, %zmm1
-; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; KNL-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
@@ -665,8 +665,8 @@
 ; SKX-NEXT:    vpmovm2q %k0, %zmm0
 ; SKX-NEXT:    vpmovm2q %k1, %zmm1
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; SKX-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; SKX-NEXT:    vpmovq2m %zmm0, %k0
+; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; SKX-NEXT:    vpmovq2m %zmm2, %k0
 ; SKX-NEXT:    kshiftlb $1, %k0, %k0
 ; SKX-NEXT:    kshiftrb $1, %k0, %k0
 ; SKX-NEXT:    kshiftlb $7, %k2, %k1
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -138,19 +138,19 @@
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovupd 8(%rdi), %zmm0
-; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vmovapd {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vmovupd 8(%rdi), %zmm1
+; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; ALL-NEXT:    vmovapd {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovupd 8(%eax), %zmm0
-; X32-AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; X32-AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT:    vmovupd 8(%eax), %zmm1
+; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; X32-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm0
 ; X32-AVX512F-NEXT:    retl
 %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
 %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@@ -225,19 +225,19 @@
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
-; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm1
+; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
-; X32-AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; X32-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm1
+; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; X32-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
 ; X32-AVX512F-NEXT:    retl
 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
 %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -334,19 +334,19 @@
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups (%rdi), %zmm0
-; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vmovups (%rdi), %zmm1
+; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; ALL-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovups (%eax), %zmm0
-; X32-AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; X32-AVX512F-NEXT:    vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X32-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT:    vmovups (%eax), %zmm1
+; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; X32-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
 ; X32-AVX512F-NEXT:    retl
 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
@@ -448,19 +448,19 @@
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu32 (%rdi), %zmm0
-; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vmovdqu32 (%rdi), %zmm1
+; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm0
-; X32-AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; X32-AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X32-AVX512F-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm1
+; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; X32-AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
 ; X32-AVX512F-NEXT:    retl
 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -262,15 +262,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_8823cc67:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_8823cc67:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
 ret <8 x double> %shuffle
@@ -281,15 +281,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_9832dc76:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_9832dc76:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
 ret <8 x double> %shuffle
@@ -300,15 +300,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_9810dc54:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_9810dc54:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
 ret <8 x double> %shuffle
@@ -370,15 +370,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_08991abb:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_08991abb:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
 ret <8 x double> %shuffle
@@ -406,15 +406,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_09ab1def:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_09ab1def:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
 ret <8 x double> %shuffle
@@ -927,15 +927,15 @@
 ; AVX512F-LABEL: shuffle_v8f64_c348cda0:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8]
-; AVX512F-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_c348cda0:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovapd %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
 ret <8 x double> %shuffle
@@ -1180,15 +1180,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
 ret <8 x i64> %shuffle
@@ -1233,15 +1233,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_8823cc67:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_8823cc67:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
 ret <8 x i64> %shuffle
@@ -1252,15 +1252,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_9832dc76:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_9832dc76:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
 ret <8 x i64> %shuffle
@@ -1271,15 +1271,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_9810dc54:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_9810dc54:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
 ret <8 x i64> %shuffle
@@ -1341,15 +1341,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_08991abb:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_08991abb:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
 ret <8 x i64> %shuffle
@@ -1377,15 +1377,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_09ab1def:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_09ab1def:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
 ret <8 x i64> %shuffle
@@ -1914,15 +1914,15 @@
 ; AVX512F-LABEL: shuffle_v8i64_6caa87e5:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
 ret <8 x i64> %shuffle
Index: test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -190,9 +190,9 @@
 ; X32-LABEL: combine_vpermt2var_8i64_identity:
 ; X32:       # BB#0:
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 =
-; X32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 =
-; X32-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0
+; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 =
+; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity:
 ; X64:       # BB#0:
@@ -280,17 +280,17 @@
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %zmm1
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; X32-NEXT:    vmovaps (%eax), %zmm2
+; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
 ; X32-NEXT:    vmovaps %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps (%rdi), %zmm1
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; X64-NEXT:    vmovaps (%rdi), %zmm2
+; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
 ; X64-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-NEXT:    retq
 %x0 = load <16 x float>, <16 x float> *%p0
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -105,8 +105,8 @@
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm1
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
@@ -119,8 +119,8 @@
 ; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; VL_BW_DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; VL_BW_DQ-NEXT:    vpmovd2m %zmm1, %k0
+; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
 ; VL_BW_DQ-NEXT:    retq
 %a2 = icmp eq <16 x i32> %a, %a1
@@ -189,8 +189,8 @@
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    retq
@@ -201,8 +201,8 @@
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
 ; VL_BW_DQ-NEXT:    retq
 %b = bitcast i8 %a to <8 x i1>
@@ -245,8 +245,8 @@
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    retq
@@ -257,8 +257,8 @@
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
 ; VL_BW_DQ-NEXT:    retq
 %b = bitcast i8 %a to <8 x i1>
@@ -307,8 +307,8 @@
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    retq
@@ -321,8 +321,8 @@
 ; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
 ; VL_BW_DQ-NEXT:    retq
 %b = bitcast i8 %a to <8 x i1>
@@ -340,8 +340,8 @@
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    retq