Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6058,7 +6058,6 @@
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -6069,7 +6068,9 @@
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -6084,7 +6085,8 @@
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -6119,6 +6121,7 @@
 
   //TODO: The code below fires only for for loading the low v2i32 / v2f32
   //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -13206,7 +13209,19 @@
   SDValue Idx = Op.getOperand(2);
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr)), (load addr + 16))
+  // --> load32 addr
+  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue Ops[] = { Vec.getOperand(1), SubVec };
+    SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+    if (LD.getNode())
+      return LD;
+  }
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
       SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8141,49 +8141,6 @@
           (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
-// Combine two consecutive 16-byte loads with a common destination register into
-// one 32-byte load to that register.
-let Predicates = [HasAVX, HasFastMem32] in {
-  def : Pat<(insert_subvector
-              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
-              (loadv4f32 (add addr:$src, (iPTR 16))),
-              (iPTR 4)),
-            (VMOVUPSYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
-              (loadv2f64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVUPDYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v32i8 (insert_subvector
-                       undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 16)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v16i16 (insert_subvector
-                        undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 8)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v8i32 (insert_subvector
-                       undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 4)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
-              (loadv2i64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVDQUYrm addr:$src)>;
-}
-
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
Index: test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -65,8 +65,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
@@ -88,8 +89,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
@@ -111,8 +113,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
@@ -133,8 +136,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
@@ -160,12 +164,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
-  %v1 = load <2 x i64>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
   %v2 = load <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
@@ -187,12 +192,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
-  %v1 = load <4 x i32>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
   %v2 = load <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
@@ -214,12 +220,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
-  %v1 = load <8 x i16>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
   %v2 = load <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
@@ -241,12 +248,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
-  %v1 = load <16 x i8>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
   %v2 = load <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
@@ -261,16 +269,17 @@
   ; SANDYB-NEXT: vaddpd
   ; SANDYB-NEXT: retq
 
-  ; BTVER2: vmovupd
-  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovupd
+  ; HASWELL-NOT: vinsertf128
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
-  %v1 = load <2 x double>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
   %v2 = load <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
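
Note (not part of the patch): a minimal IR sketch of the pattern the new insert_subvector DAG combine targets, written in the same IR style as the test file above; the function name is illustrative. Two consecutive unaligned 16-byte loads concatenated into a 256-bit vector are expected to lower to a single 32-byte vmovups instead of vmovups + vinsertf128. The combine is gated on !Subtarget->isUnalignedMem32Slow(), so subtargets with slow unaligned 32-byte accesses are unaffected.

; Sketch only: two consecutive unaligned 16-byte loads merged into one 32-byte load.
define <8 x float> @sketch_merge_consecutive_loads(<4 x float>* %ptr) {
  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1    ; bytes 16..31
  %v1 = load <4 x float>* %ptr, align 1                      ; bytes 0..15, unaligned
  %v2 = load <4 x float>* %ptr2, align 1                     ; bytes 16..31, unaligned
  ; concatenate the two 128-bit halves into one 256-bit value
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}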