This is an archive of the discontinued LLVM Phabricator instance.

fix a logic bug in x86 vector codegen: sext (zext (x) ) != sext (x) (PR20472)
ClosedPublic

Authored by spatel on Aug 14 2014, 10:00 AM.

Download Raw Diff

Details

Reviewers

hliao
rengolin
chandlerc
nadav
• rafael
hfinkel

Commits

rG81ecbb0737d2: Fix a logic bug in x86 vector codegen: sext (zext (x) ) != sext (x) (PR20472).
rL216679: Fix a logic bug in x86 vector codegen: sext (zext (x) ) != sext (x) (PR20472).

Summary

[resending as new patch because I forgot to add llvm-commits initially]

As shown in InstCombine: sext ( zext ( x ) ) -> zext ( x ).

The code in X86ISelLowering that this patch proposes to remove mistakenly implemented the transform as:
sext ( zext ( x ) ) -> sext ( x )

I.e., what should be a zext output was turned into a sext.

I want to believe that the logic in the original patch has some value as an optimization for some other case and it's just not in the right place here. But the test cases from:
http://llvm.org/viewvc/llvm-project?view=revision&revision=177421
don't provide any evidence.

We just have miscompile bugs:
http://llvm.org/bugs/show_bug.cgi?id=20472 (please see this one for more of my analysis)
http://llvm.org/bugs/show_bug.cgi?id=18054

The testcases that I've added here confirm that we (1) don't remove a zext op that is necessary and (2) generate a pmovzx instead of punpck if SSE4.1 is available. Although pmovzx is 1 byte longer, it allows folding of the load, and so saves 3 bytes overall.

We don't need to call LowerVectorIntExtend() from LowerSIGN_EXTEND_INREG() to get this codegen either - that is already handled in NormalizeVectorShuffle().

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 12516.Aug 14 2014, 10:00 AM

spatel retitled this revision to: fix a logic bug in x86 vector codegen: sext (zext (x) ) != sext (x) (PR20472).

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: nadav, chandlerc, hliao, • rafael.

spatel added a subscriber: Unknown Object (MLST).

Ping.

Ping 2.

This is a miscompile that apparently occurs in the wild.

Easy review - the fix doesn't add any lines of code other than comments. :)

LGTM!

Sanjay, you can commit changes like this without waiting for pre-commit code review (See item #3 in http://llvm.org/docs/DeveloperPolicy.html#code-reviews).

Thanks,
Nadav

Closed by commit rL216679 (authored by @spatel).

Thanks, Nadav!
Checked in at r216679.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

36 lines

test/

CodeGen/

X86/

vec_trunc_sext.ll

30 lines

Diff 13051

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 16,440 Lines • ▼ Show 20 Lines	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC =		SDValue SetCC =
DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),		DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
DAG.getConstant(Cond, MVT::i32),		DAG.getConstant(Cond, MVT::i32),
SDValue(Sum.getNode(), 1));		SDValue(Sum.getNode(), 1));

return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);		return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}		}

		// Sign extension of the low part of vector elements. This may be used either
		// when sign extend instructions are not available or if the vector element
		// sizes already match the sign-extended size. If the vector elements are in
		// their pre-extended size and sign extend instructions are available, that will
		// be handled by LowerSIGN_EXTEND.
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,		SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc dl(Op);		SDLoc dl(Op);
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();		EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();

if (!Subtarget->hasSSE2() \|\| !VT.isVector())		if (!Subtarget->hasSSE2() \|\| !VT.isVector())
return SDValue();		return SDValue();
Show All 29 Lines	case MVT::v16i16:
LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);		LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);		return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
}		}
// fall through		// fall through
case MVT::v4i32:		case MVT::v4i32:
case MVT::v8i16: {		case MVT::v8i16: {
SDValue Op0 = Op.getOperand(0);		SDValue Op0 = Op.getOperand(0);
SDValue Op00 = Op0.getOperand(0);
SDValue Tmp1;
// Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
if (Op0.getOpcode() == ISD::BITCAST &&
Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
// (sext (vzext x)) -> (vsext x)
Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
if (Tmp1.getNode()) {
EVT ExtraEltVT = ExtraVT.getVectorElementType();
// This folding is only valid when the in-reg type is a vector of i8,
// i16, or i32.
if (ExtraEltVT == MVT::i8 \|\| ExtraEltVT == MVT::i16 \|\|
ExtraEltVT == MVT::i32) {
SDValue Tmp1Op0 = Tmp1.getOperand(0);
assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
"This optimization is invalid without a VZEXT.");
return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
}
Op0 = Tmp1;
}
}

// If the above didn't work, then just use Shift-Left + Shift-Right.		// This is a sign extension of some low part of vector elements without
Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,		// changing the size of the vector elements themselves:
DAG);		// Shift-Left + Shift-Right-Algebraic.
return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,		SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
		BitsDiff, DAG);
		return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
DAG);		DAG);
}		}
}		}
}		}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,		static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc dl(Op);		SDLoc dl(Op);
▲ Show 20 Lines • Show All 7,060 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vec_trunc_sext.ll

				; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - \| FileCheck %s -check-prefix=NO_SSE_41
				; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - \| FileCheck %s -check-prefix=SSE_41

				; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 )
				; When sexting a trunc'd vector value, we can't eliminate the zext.
				; If we don't have SSE4.1, use punpck.
				; If we have SSE4.1, use pmovzx because it combines the load op.
				; There may be a better way to do this using pshufb + pmovsx,
				; but that is beyond our current codegen capabilities.

				define <4 x i32> @trunc_sext(<4 x i16>* %in) {
				%load = load <4 x i16>* %in
				%trunc = trunc <4 x i16> %load to <4 x i8>
				%sext = sext <4 x i8> %trunc to <4 x i32>
				ret <4 x i32> %sext

				; NO_SSE_41-LABEL: trunc_sext:
				; NO_SSE_41: movq (%rdi), %xmm0
				; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0
				; NO_SSE_41-NEXT: pslld $24, %xmm0
				; NO_SSE_41-NEXT: psrad $24, %xmm0
				; NO_SSE_41-NEXT: retq

				; SSE_41-LABEL: trunc_sext:
				; SSE_41: pmovzxwd (%rdi), %xmm0
				; SSE_41-NEXT: pslld $24, %xmm0
				; SSE_41-NEXT: psrad $24, %xmm0
				; SSE_41-NEXT: retq
				}