Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1970,13 +1970,12 @@ MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi}; - Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), - DL, OpsHi, MMO); - - // Build a factor node to remember that this store is independent of the - // other one. - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + // After the split, the order of the two scatter operations is well defined: + // the "Hi" part comes after the "Lo" part. Chain the Hi scatter on the Lo + // scatter (rather than on Ch) to enforce that order. + SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), + DL, OpsHi, MMO); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { Index: test/CodeGen/X86/scatter-schedule.ll =================================================================== --- test/CodeGen/X86/scatter-schedule.ll +++ test/CodeGen/X86/scatter-schedule.ll @@ -5,16 +5,15 @@ ; This test checks the order of scatter operations after split. ; The right order is "from LSB to MSB", otherwise the semantic is broken. -; The submitted version of the test demonstrates the bug. 
define void @test(i64 %x272, <16 x i32*> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # BB#0: -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: kxnorw %k0, %k0, %k2 -; CHECK-NEXT: vpscatterqd %ymm3, (,%zmm1) {%k2} -; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} +; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k2} +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %x270, <16 x i32*> %x335, i32 4, <16 x i1> )