Diff 83689

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,678 Lines • ▼ Show 20 Lines	static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
// We have to cast V2 around.		// We have to cast V2 around.
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);		MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,		V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
DAG.getBitcast(MaskVT, V1Mask),		DAG.getBitcast(MaskVT, V1Mask),
DAG.getBitcast(MaskVT, V2)));		DAG.getBitcast(MaskVT, V2)));
return DAG.getNode(ISD::OR, DL, VT, V1, V2);		return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}		}

		static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
		const X86Subtarget &Subtarget, SelectionDAG &DAG,
		const SDLoc &dl);

/// \brief Try to emit a blend instruction for a shuffle.		/// \brief Try to emit a blend instruction for a shuffle.
///		///
/// This doesn't do any checks for the availability of instructions for blending		/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to		/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is		/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.		/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,		static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,		SDValue V2, ArrayRef<int> Original,
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines	for (int i = 0, Size = Mask.size(); i < Size; ++i)
MVT::i8));		MVT::i8));

V1 = DAG.getBitcast(BlendVT, V1);		V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);		V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(		return DAG.getBitcast(
VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,		VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));		DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
}		}
		case MVT::v16f32:
		case MVT::v8i64:
		case MVT::v16i32:
		case MVT::v32i16:
		case MVT::v64i8: {
		MVT IntegerType =
		MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
		SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
		unsigned NumElts = VT.getVectorNumElements();
		SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
		igorb (inline comment, marked Done): You can use getVectorMaskingNode to simplify this code; all the logic is already implemented.
		Subtarget, DAG, DL);
		SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
		igorb (inline comment, marked Done): This variable (ZeroVector) is not in use.
		return DAG.getNode(ISD::VSELECT, DL, VT, VMask, V1, V2);
		}
default:		default:
llvm_unreachable("Not a supported integer vector type!");		llvm_unreachable("Not a supported integer vector type!");
}		}
}		}

/// \brief Try to lower as a blend of elements from two inputs followed by		/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.		/// a single-input permutation.
///		///
▲ Show 20 Lines • Show All 4,347 Lines • ▼ Show 20 Lines	if (SDValue Op =
lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))		lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;		return Op;

return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}		}

/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.		/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,		static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
		const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,		SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");		assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

// If the shuffle mask is repeated in each 128-bit lane, we have many more		// If the shuffle mask is repeated in each 128-bit lane, we have many more
Show All 12 Lines	if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,		return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));		getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue Unpck =		if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))		lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;		return Unpck;

		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
		Zeroable, Subtarget, DAG))
		return Blend;

// Otherwise, fall back to a SHUFPS sequence.		// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);		return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}		}

return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}		}

/// \brief Handle lowering of 8-lane 64-bit integer shuffles.		/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,		if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
Mask, Subtarget, DAG))		Mask, Subtarget, DAG))
return Rotate;		return Rotate;

if (SDValue Unpck =		if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))		lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;		return Unpck;

		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
		Zeroable, Subtarget, DAG))
		return Blend;

return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}		}

/// \brief Handle lowering of 16-lane 32-bit integer shuffles.		/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,		static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const SmallBitVector &Zeroable,		const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,		SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
Show All 30 Lines	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))		Zeroable, Subtarget, DAG))
return Shift;		return Shift;

// Try to use VALIGN.		// Try to use VALIGN.
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,		if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
Mask, Subtarget, DAG))		Mask, Subtarget, DAG))
return Rotate;		return Rotate;

		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
		Zeroable, Subtarget, DAG))
		return Blend;

// Try to use byte rotation instructions.		// Try to use byte rotation instructions.
if (Subtarget.hasBWI())		if (Subtarget.hasBWI())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(		if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;		return Rotate;

return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}		}
Show All 37 Lines	if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be		// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16		// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.		// lowering to handle even the v32 case.
return lowerV8I16GeneralSingleInputVectorShuffle(		return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);		DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}		}
}		}

		if (Subtarget.hasBWI())
		RKSimon (inline comment, marked Done): This check is unnecessary, as AVX512BW is a requirement for v32i16 - see the assert at the top of the function.
		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
		Zeroable, Subtarget, DAG))
		return Blend;

return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}		}

/// \brief Handle lowering of 64-lane 8-bit integer shuffles.		/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,		static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const SmallBitVector &Zeroable,		const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,		SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
Show All 24 Lines	static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(		if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;		return Rotate;

if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(		if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))		DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;		return PSHUFB;

		if (Subtarget.hasBWI())
		RKSimon (inline comment, marked Done): This check is unnecessary, as AVX512BW is a requirement for v64i8 - see the assert at the top of the function.
		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
		Zeroable, Subtarget, DAG))
		return Blend;

// VBMI can use VPERMV/VPERMV3 byte shuffles.		// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())		if (Subtarget.hasVBMI())
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);		return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

// FIXME: Implement direct support for this type!		// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}		}

Show All 28 Lines	static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Dispatch to each element type for lowering. If we don't have support for		// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and		// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that		// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.		// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {		switch (VT.SimpleTy) {
case MVT::v8f64:		case MVT::v8f64:
return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);		return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16f32:		case MVT::v16f32:
return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);		return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:		case MVT::v8i64:
return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);		return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:		case MVT::v16i32:
return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);		return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:		case MVT::v32i16:
return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);		return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:		case MVT::v64i8:
return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);		return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
▲ Show 20 Lines • Show All 20,961 Lines • Show Last 20 Lines

test/CodeGen/X86/merge-consecutive-loads-512.ll

Show First 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	; X32-AVX512F-NEXT: retl
%res6 = insertelement <8 x i64> %res4, i64 0, i32 6		%res6 = insertelement <8 x i64> %res4, i64 0, i32 6
%res7 = insertelement <8 x i64> %res6, i64 0, i32 7		%res7 = insertelement <8 x i64> %res6, i64 0, i32 7
ret <8 x i64> %res7		ret <8 x i64> %res7
}		}

define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {		define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:		; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vmovdqu64 8(%rdi), %zmm1		; ALL-NEXT: movb $32, %al
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2		; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>		; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq		; ALL-NEXT: retq
;		;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:		; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X32-AVX512F: # BB#0:		; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-AVX512F-NEXT: movl 4(%esp), %eax
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm1		; X32-AVX512F-NEXT: movb $32, %cl
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2		; X32-AVX512F-NEXT: kmovw %ecx, %k1
; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>		; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl		; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1		%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3		%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
%ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5		%ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
%ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8		%ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
%val0 = load i64, i64* %ptr0		%val0 = load i64, i64* %ptr0
%val2 = load i64, i64* %ptr2		%val2 = load i64, i64* %ptr2
%val4 = load i64, i64* %ptr4		%val4 = load i64, i64* %ptr4
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	; X32-AVX512F-NEXT: retl
%resE = insertelement <16 x i32> %resC, i32 %valE, i32 14		%resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
%resF = insertelement <16 x i32> %resE, i32 %valF, i32 15		%resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
ret <16 x i32> %resF		ret <16 x i32> %resF
}		}

define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {		define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:		; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vmovdqu32 (%rdi), %zmm1		; ALL-NEXT: movw $8240, %ax # imm = 0x2030
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2		; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>		; ALL-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq		; ALL-NEXT: retq
;		;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:		; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # BB#0:		; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-AVX512F-NEXT: movl 4(%esp), %eax
; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm1		; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2		; X32-AVX512F-NEXT: kmovw %ecx, %k1
; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>		; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl		; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3		%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
%ptrC = getelementptr inbounds i32, i32* %ptr, i64 12		%ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
%ptrE = getelementptr inbounds i32, i32* %ptr, i64 14		%ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
%ptrF = getelementptr inbounds i32, i32* %ptr, i64 15		%ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
%val0 = load i32, i32* %ptr0		%val0 = load i32, i32* %ptr0
%val3 = load i32, i32* %ptr3		%val3 = load i32, i32* %ptr3
▲ Show 20 Lines • Show All 247 Lines • Show Last 20 Lines

test/CodeGen/X86/sse3-avx-addsub.ll

	Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines
	; AVX1-LABEL: test5:			; AVX1-LABEL: test5:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0			; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
	; AVX1-NEXT: vaddsubps %ymm3, %ymm1, %ymm1			; AVX1-NEXT: vaddsubps %ymm3, %ymm1, %ymm1
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX512-LABEL: test5:			; AVX512-LABEL: test5:
	; AVX512: # BB#0:			; AVX512: # BB#0:
	; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2			; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
	; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0			; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
	; AVX512-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]			; AVX512-NEXT: kmovw %eax, %k1
	; AVX512-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]			; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
				; AVX512-NEXT: vmovaps %zmm2, %zmm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%add = fadd <16 x float> %A, %B			%add = fadd <16 x float> %A, %B
	%sub = fsub <16 x float> %A, %B			%sub = fsub <16 x float> %A, %B
	%vecinit2 = shufflevector <16 x float> %sub, <16 x float> %add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>			%vecinit2 = shufflevector <16 x float> %sub, <16 x float> %add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
	ret <16 x float> %vecinit2			ret <16 x float> %vecinit2
	}			}

	define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {			define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
	▲ Show 20 Lines • Show All 165 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-512-v16.ll

Show First 20 Lines • Show All 246 Lines • ▼ Show 20 Lines	; ALL-NEXT: retq
%c = load <16 x i32>, <16 x i32>* %b		%c = load <16 x i32>, <16 x i32>* %b
%d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>		%d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
ret <16 x i32> %d		ret <16 x i32> %d
}		}

define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {		define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:		; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,1,2,19,u,u,u,u,u,u,u,u,u,u,u,u>		; ALL-NEXT: movw $8, %ax
; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0		; ALL-NEXT: kmovw %eax, %k1
		; ALL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; ALL-NEXT: retq		; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>		%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c		ret <16 x i32> %c
}		}

define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {		define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
; ALL-LABEL: shuffle_v16f32_extract_256:		; ALL-LABEL: shuffle_v16f32_extract_256:
; ALL: # BB#0:		; ALL: # BB#0:
▲ Show 20 Lines • Show All 144 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-512-v32.ll

	Show First 20 Lines • Show All 104 Lines • ▼ Show 20 Lines
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>			%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
	ret <32 x i16> %c			ret <32 x i16> %c
	}			}

	define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {			define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
	; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:			; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
	; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1			; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
	; ALL-NEXT: vpermt2w %zmm0, %zmm2, %zmm1			; ALL-NEXT: movl $1, %eax
	; ALL-NEXT: vmovdqa64 %zmm1, %zmm0			; ALL-NEXT: kmovd %eax, %k1
				; ALL-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
				RKSimon (inline comment, marked Done): Any idea why this isn't using a blend with zero (_mm512_maskz_mov_epi16)?
				m_zuckerman (author, inline reply, marked Not Done): The patterns were missing; this was changed in commit 291368.
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>			%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
	ret <32 x i16> %shuffle			ret <32 x i16> %shuffle
	}			}

	define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {			define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
	; ALL-LABEL: insert_dup_mem_v32i16_i32:			; ALL-LABEL: insert_dup_mem_v32i16_i32:
	; ALL: # BB#0:			; ALL: # BB#0:
	▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-512-v8.ll

Show First 20 Lines • Show All 1,173 Lines • ▼ Show 20 Lines	; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i64> %shuffle		ret <8 x i64> %shuffle
}		}

define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {		define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
;		;
; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:		; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
; AVX512F: # BB#0:		; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]		; AVX512F-NEXT: movb $-86, %al
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2		; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0		; AVX512F-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:		; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
; AVX512F-32: # BB#0:		; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]		; AVX512F-32-NEXT: movb $-86, %al
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2		; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0		; AVX512F-32-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
; AVX512F-32-NEXT: retl		; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>		%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle		ret <8 x i64> %shuffle
}		}

define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {		define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
;		;
; AVX512F-LABEL: shuffle_v8i64_08080808:		; AVX512F-LABEL: shuffle_v8i64_08080808:
▲ Show 20 Lines • Show All 1,188 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx \| FileCheck %s --check-prefix=SKX
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl \| FileCheck %s --check-prefix=KNL

				igorb (inline comment, marked Done): Could you please add tests for all the cases (i8/i16/f64)?
				; Two-input <16 x i32> shuffle whose mask alternates sources: even result
				; lanes use indices 16, 18, ..., 30 (elements of %W) and odd result lanes use
				; indices 1, 3, ..., 15 (elements of %A). The checks for both the -mcpu=skx
				; and -mcpu=knl runs expect the constant 0xAAAA to be moved into %k1 and a
				; single masked vpblendmd to produce the result.
				define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
				; SKX-LABEL: test_mm512_mask_blend_epi32:
				; SKX: # BB#0: # %entry
				; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
				; SKX-NEXT: kmovw %eax, %k1
				; SKX-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
				; SKX-NEXT: retq
				;
				; KNL-LABEL: test_mm512_mask_blend_epi32:
				; KNL: # BB#0: # %entry
				; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
				; KNL-NEXT: kmovw %eax, %k1
				; KNL-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
				; KNL-NEXT: retq
				entry:
				%0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
				ret <16 x i32> %0
				}

				; Two-input <8 x i64> shuffle whose mask alternates sources: even result
				; lanes use indices 8, 10, 12, 14 (elements of %W) and odd result lanes use
				; indices 1, 3, 5, 7 (elements of %A). Both runs expect the byte constant -86
				; to be moved into %k1 followed by a single masked vpblendmq; the only
				; difference between them is the mask move (kmovb for -mcpu=skx, kmovw for
				; -mcpu=knl).
				define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
				; SKX-LABEL: test_mm512_mask_blend_epi64:
				; SKX: # BB#0: # %entry
				; SKX-NEXT: movb $-86, %al
				; SKX-NEXT: kmovb %eax, %k1
				; SKX-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
				; SKX-NEXT: retq
				;
				; KNL-LABEL: test_mm512_mask_blend_epi64:
				; KNL: # BB#0: # %entry
				; KNL-NEXT: movb $-86, %al
				; KNL-NEXT: kmovw %eax, %k1
				; KNL-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
				; KNL-NEXT: retq
				entry:
				%0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
				ret <8 x i64> %0
				}

				; Floating-point counterpart of the <16 x i32> blend test: a two-input
				; <16 x float> shuffle whose even result lanes use indices 16, 18, ..., 30
				; (elements of %W) and whose odd result lanes use indices 1, 3, ..., 15
				; (elements of %A). The checks for both the -mcpu=skx and -mcpu=knl runs
				; expect the constant 0xAAAA in %k1 and a single masked vblendmps.
				define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
				; SKX-LABEL: test_mm512_mask_blend_ps:
				; SKX: # BB#0: # %entry
				; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
				; SKX-NEXT: kmovw %eax, %k1
				; SKX-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1}
				; SKX-NEXT: retq
				;
				; KNL-LABEL: test_mm512_mask_blend_ps:
				; KNL: # BB#0: # %entry
				; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
				; KNL-NEXT: kmovw %eax, %k1
				; KNL-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1}
				; KNL-NEXT: retq
				entry:
				%0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
				ret <16 x float> %0
				}

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512] Adding missing shuffle lowering to blend mask instructions (VPBLENDMB/VPBLENDMW/VPBLENDMD/VPBLENDMQ) .
ClosedPublic

Details

Diff Detail

Event Timeline

Script:

/build/build-ubsan/./bin/llc < /src/test/CodeGen/X86/vector-shuffle-avx512.ll -mtriple=x86_64-pc-linux-gnu -mcpu=knl | /build/build-ubsan/./bin/FileCheck /src/test/CodeGen/X86/vector-shuffle-avx512.ll --check-prefix=KNL

Command Output (stderr):

Revision Contents

Diff 83689

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/merge-consecutive-loads-512.ll

test/CodeGen/X86/sse3-avx-addsub.ll

test/CodeGen/X86/vector-shuffle-512-v16.ll

test/CodeGen/X86/vector-shuffle-512-v32.ll

test/CodeGen/X86/vector-shuffle-512-v8.ll

test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512] Adding missing shuffle lowering to blend mask instructions (VPBLENDMB/VPBLENDMW/VPBLENDMD/VPBLENDMQ) . ClosedPublic

Details

Diff Detail

Event Timeline

Script:

/build/build-ubsan/./bin/llc < /src/test/CodeGen/X86/vector-shuffle-avx512.ll -mtriple=x86_64-pc-linux-gnu -mcpu=knl | /build/build-ubsan/./bin/FileCheck /src/test/CodeGen/X86/vector-shuffle-avx512.ll --check-prefix=KNL

Command Output (stderr):

Revision Contents

Diff 83689

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/merge-consecutive-loads-512.ll

test/CodeGen/X86/sse3-avx-addsub.ll

test/CodeGen/X86/vector-shuffle-512-v16.ll

test/CodeGen/X86/vector-shuffle-512-v32.ll

test/CodeGen/X86/vector-shuffle-512-v8.ll

test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll

[X86][AVX512] Adding missing shuffle lowering to blend mask instructions (VPBLENDMB/VPBLENDMW/VPBLENDMD/VPBLENDMQ) .
ClosedPublic