Index: lib/Target/X86/MCTargetDesc/X86BaseInfo.h
===================================================================
--- lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -767,6 +767,16 @@
     return (reg == X86::SPL || reg == X86::BPL ||
             reg == X86::SIL || reg == X86::DIL);
   }
+
+  /// isKMasked - Is this a masked instruction?
+  inline bool isKMasked(uint64_t TSFlags) {
+    return (TSFlags & X86II::EVEX_K) != 0;
+  }
+
+  /// isKMergeMasked - Is this a merge-masked instruction?
+  inline bool isKMergeMasked(uint64_t TSFlags) {
+    return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0;
+  }
 }
 } // end namespace llvm;
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -8215,14 +8215,14 @@
                          (OpNode (_.VT _.RC:$src1),
                                  (_.VT _.RC:$src2),
                                  (_.VT _.RC:$src3),
-                                 (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V;
+                                 (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
   defm rmi : AVX512_maskable_3src,
+                                 (i8 imm:$src4)), 1, 0>,
                          AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
   defm rmbi : AVX512_maskable_3src, EVEX_B,
+                                 (i8 imm:$src4)), 1, 0>, EVEX_B,
                          AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
   }// Constraints = "$src1 = $dst"
 }
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -584,6 +584,25 @@
   /// operand and follow operands form a reference to the stack frame.
   bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
                       int &FrameIndex) const;
+
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction with 3 vector inputs.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if they are not
+  /// pre-defined, which is indicated by the special value
+  /// 'CommuteAnyOperandIndex' assigned to them.
+  /// If both indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+  ///     findThreeSrcCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with operand #1.
+  bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+                                     unsigned &SrcOpIdx1,
+                                     unsigned &SrcOpIdx2) const;
 };
 } // End llvm namespace
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3318,30 +3318,21 @@
   return NewMI;
 }
 
-unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
-    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
-    const X86InstrFMA3Group &FMA3Group) const {
-
-  unsigned Opc = MI.getOpcode();
-
+/// This determines which of the three possible cases of a three-source
+/// commute the source indexes correspond to, taking into account any mask
+/// operands. It also prevents commuting a passthru operand. Returns -1 if
+/// the commute isn't possible.
+static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+                                  unsigned SrcOpIdx2) {
   // Put the lowest index to SrcOpIdx1 to simplify the checks below.
   if (SrcOpIdx1 > SrcOpIdx2)
     std::swap(SrcOpIdx1, SrcOpIdx2);
 
-  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
-  // analysis. The commute optimization is legal only if all users of FMA*_Int
-  // use only the lowest element of the FMA*_Int instruction. Such analysis are
-  // not implemented yet. So, just return 0 in that case.
-  // When such analysis are available this place will be the right place for
-  // calling it.
-  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
-    return 0;
-
-  unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3;
-  if (FMA3Group.isKMasked()) {
+  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+  if (X86II::isKMasked(TSFlags)) {
     // The k-mask operand cannot be commuted.
     if (SrcOpIdx1 == 2)
-      return 0;
+      return -1;
 
     // For k-zero-masked operations it is Ok to commute the first vector
     // operand.
@@ -3356,20 +3347,43 @@
     //       : v1[i];
     //   VMOVAPSZmrk , k, v4; // this is the ONLY user of v4 ->
     //                        // Ok, to commute v1 in FMADD213PSZrk.
-    if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1)
-      return 0;
-    FMAOp2++;
-    FMAOp3++;
+    if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
+      return -1;
+    Op2++;
+    Op3++;
   }
 
-  unsigned Case;
-  if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2)
-    Case = 0;
-  else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3)
-    Case = 1;
-  else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3)
-    Case = 2;
-  else
+  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+    return 0;
+  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+    return 1;
+  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+    return 2;
+  return -1;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
+
+  unsigned Opc = MI.getOpcode();
+
+  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+  if (SrcOpIdx1 > SrcOpIdx2)
+    std::swap(SrcOpIdx1, SrcOpIdx2);
+
+  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+  // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such analysis are
+  // not implemented yet. So, just return 0 in that case.
+  // When such analysis are available this place will be the right place for
+  // calling it.
+  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+    return 0;
+
+  // Determine which case this commute is or if it can't be done.
+  int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+  if (Case < 0)
     return 0;
 
   // Define the FMA forms mapping array that helps to map input FMA form
@@ -3416,6 +3430,36 @@
   return FMAForms[FormIndex];
 }
 
+static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+                             unsigned SrcOpIdx2) {
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+
+  // Determine which case this commute is or if it can't be done.
+  int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
+  if (Case < 0)
+    return false;
+
+  // For each case we need to swap two pairs of bits in the final immediate.
+  static const uint8_t SwapMasks[3][4] = {
+    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+  };
+
+  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+  // Clear out the bits we are swapping.
+  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+                           SwapMasks[Case][2] | SwapMasks[Case][3]);
+  // If the immediate had a bit of the pair set, then set the opposite bit.
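+  // For example, an immediate of 0x21 (bits 0 and 5 set) becomes 0x09 under
+  // case 0 (bit 5 pairs with bit 3), stays 0x21 under case 1, and becomes
+  // 0x41 under case 2 (bit 5 pairs with bit 6), matching the $9/$33/$65
+  // immediates in the updated test checks below.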
+ if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1]; + if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0]; + if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3]; + if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2]; + MI.getOperand(MI.getNumOperands()-1).setImm(NewImm); + + return true; +} + MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -3680,6 +3724,30 @@ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: + case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: + case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: + case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: + case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: + case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: + case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: + case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: + case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: + case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: + case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: + case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: + case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: + case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: + case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: { + auto &WorkingMI = cloneIfNew(MI); + if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2)) + return nullptr; + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } default: const X86InstrFMA3Group *FMA3Group = X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); @@ -3701,16 +3769,30 @@ bool X86InstrInfo::findFMA3CommutedOpIndices( const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const { + + if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2)) + return false; + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0; +} + +bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + uint64_t TSFlags = MI.getDesc().TSFlags; + unsigned FirstCommutableVecOp = 1; unsigned LastCommutableVecOp = 3; unsigned KMaskOp = 0; - if (FMA3Group.isKMasked()) { + if (X86II::isKMasked(TSFlags)) { // The k-mask operand has index = 2 for masked and zero-masked operations. KMaskOp = 2; // The operand with index = 1 is used as a source for those elements for // which the corresponding bit in the k-mask is set to 0. - if (FMA3Group.isKMergeMasked()) + if (X86II::isKMergeMasked(TSFlags)) FirstCommutableVecOp = 3; LastCommutableVecOp++; @@ -3775,9 +3857,7 @@ return false; } - // Check if we can adjust the opcode to preserve the semantics when - // commute the register operands. 
-  return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+  return true;
 }
 
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
@@ -3819,6 +3899,25 @@
     }
     return false;
   }
+  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
+  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
+  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
+  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
+  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
+  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
+  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
+  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
+  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
+  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
+  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
+  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
+  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
+  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
+  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+    return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   default:
     const X86InstrFMA3Group *FMA3Group =
         X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
Index: test/CodeGen/X86/avx512-vpternlog-commute.ll
===================================================================
--- test/CodeGen/X86/avx512-vpternlog-commute.ll
+++ test/CodeGen/X86/avx512-vpternlog-commute.ll
@@ -18,8 +18,7 @@
 define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
   ret <16 x i32> %res
@@ -28,8 +27,8 @@
 define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, %zmm0, %zmm2, %zmm1
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
   ret <16 x i32> %res
@@ -38,9 +37,7 @@
 define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
@@ -50,8 +47,7 @@
 define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
@@ -71,8 +67,7 @@
 define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
@@ -82,9 +77,7 @@
 define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
@@ -94,8 +87,7 @@
 define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
@@ -105,8 +97,7 @@
 define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
@@ -116,9 +107,7 @@
 define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm2, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
@@ -128,9 +117,7 @@
 define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
@@ -140,9 +127,7 @@
 define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
@@ -162,8 +147,7 @@
 define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
@@ -219,8 +203,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_012_load1_mask:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
@@ -242,8 +225,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_102_load0_mask:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
@@ -291,8 +273,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_210_load1_mask:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
@@ -341,8 +322,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_021_load2_mask:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
@@ -363,8 +343,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_102_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
   ret <16 x i32> %res
@@ -374,8 +353,8 @@
 ; CHECK-LABEL: vpternlog_v16i32_210_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, %zmm0, %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
   ret <16 x i32> %res
@@ -385,9 +364,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_012_load0_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
@@ -398,8 +375,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_012_load1_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
@@ -421,8 +397,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_102_load0_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
@@ -433,9 +408,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_102_load1_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
@@ -446,8 +419,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_102_load2_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
@@ -458,8 +430,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_210_load0_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
@@ -470,9 +441,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_210_load1_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm2, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
@@ -483,9 +452,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_210_load2_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
@@ -496,9 +463,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_021_load0_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
@@ -520,8 +485,7 @@
 ; CHECK-LABEL: vpternlog_v16i32_021_load2_maskz:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)