diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19323,21 +19323,31 @@ /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(LSBaseSDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases) { - SmallVector Chains; // List of chains to visit. - SmallPtrSet Visited; // Visited node set. + + // Chains holds pairs of chain values: the first is the chain being + // considered and the second is the chain to select should aliasing fail. + + SmallVector, 16> + Chains; + SmallPtrSet Visited; // Get alias information for node. bool IsLoad = isa(N) && !N->isVolatile(); // Starting off. - Chains.push_back(OriginalChain); + Chains.push_back(std::make_pair(OriginalChain, OriginalChain)); unsigned Depth = 0; // Look at each chain and determine if it is an alias. If so, add it to the // aliases list. If not, then continue up the chain looking for the next // candidate. - while (!Chains.empty()) { - SDValue Chain = Chains.pop_back_val(); + SDValue Chain, DepOnAlias; + for (unsigned ChainIdx = 0; ChainIdx < Chains.size(); ++ChainIdx) { + std::tie(Chain, DepOnAlias) = Chains[ChainIdx]; + + // Don't bother if we've been here before. + if (!Visited.insert(Chain.getNode()).second) + continue; // For TokenFactor nodes, look at each operand and only continue up the // chain until we reach the depth limit. @@ -19351,10 +19361,6 @@ return; } - // Don't bother if we've been before. - if (!Visited.insert(Chain.getNode()).second) - continue; - switch (Chain.getOpcode()) { case ISD::EntryToken: // Entry token is ideal chain operand, but handled in FindBetterChain. @@ -19366,13 +19372,15 @@ bool IsOpLoad = isa(Chain.getNode()) && !cast(Chain.getNode())->isVolatile(); - // If chain is alias then stop here. + // If chain is an alias then stop here.
if (!(IsLoad && IsOpLoad) && - isAlias(N, cast(Chain.getNode()))) { - Aliases.push_back(Chain); + isAlias(cast(N), cast(Chain.getNode()))) { + Visited.insert(Chain->getOperand(0).getNode()); + Aliases.push_back(DepOnAlias); } else { // Look further up the chain. - Chains.push_back(Chain.getOperand(0)); + Chains.push_back( + std::make_pair(Chain.getOperand(0), Chain.getOperand(0))); ++Depth; } break; @@ -19381,26 +19389,75 @@ case ISD::TokenFactor: // We have to check each of the operands of the token factor for "small" // token factors, so we queue them up. Adding the operands to the queue - // (stack) in reverse order maintains the original order and increases the + // (stack) in order maintains the original order and increases the // likelihood that getNode will find a matching token factor (CSE.) if (Chain.getNumOperands() > 16) { - Aliases.push_back(Chain); + for (const SDValue &C : Chain->op_values()) + Visited.insert(C.getNode()); + Aliases.push_back(DepOnAlias); break; } - for (unsigned n = Chain.getNumOperands(); n;) - Chains.push_back(Chain.getOperand(--n)); + for (unsigned n = 0; n < Chain.getNumOperands(); ++n) { + auto NewChain = Chain.getOperand(n); + Chains.push_back(std::make_pair(NewChain, NewChain)); + } ++Depth; break; - case ISD::CopyFromReg: + case ISD::CopyFromReg: { // Forward past CopyFromReg. - Chains.push_back(Chain.getOperand(0)); - ++Depth; + auto LastOpNo = Chain->getNumOperands() - 1; + bool isGlued = Chain->getOperand(LastOpNo).getValueType() == MVT::Glue && + Chain->getOperand(0).getNode() == + Chain->getOperand(LastOpNo).getNode(); + // keep the old alias value if we're in the middle of a glued region. + auto NewDepOnAlias = (isGlued) ? DepOnAlias : Chain.getOperand(0); + Chains.push_back(std::make_pair(Chain.getOperand(0), NewDepOnAlias)); + break; + } + + case ISD::CopyToReg: { + // Forward past CopyToReg nodes. Do not pass by glueless instances, as we + // use this to expose live values into the DAG. 
+ auto LastOpNo = Chain->getNumOperands() - 1; + // Input Glue + bool hasGlueInput = + Chain->getOperand(LastOpNo).getValueType() == MVT::Glue && + Chain->getOperand(0).getNode() == + Chain->getOperand(LastOpNo).getNode(); + if (hasGlueInput || Chain != DepOnAlias) + Chains.push_back(std::make_pair(Chain.getOperand(0), DepOnAlias)); + else { + Visited.insert(Chain->getOperand(0).getNode()); + Aliases.push_back(DepOnAlias); + } break; + } + + case ISD::INLINEASM: { + auto LastOpNo = Chain->getNumOperands() - 1; + bool isGlued = Chain->getOperand(LastOpNo).getValueType() == MVT::Glue && + Chain->getOperand(0).getNode() == + Chain->getOperand(LastOpNo).getNode(); + auto *ExtraInfo = + cast(Chain->getOperand(InlineAsm::Op_ExtraInfo)); + unsigned EIInt = ExtraInfo->getZExtValue(); + bool IsSafe = !(EIInt & InlineAsm::Extra_MayStore); + if (!IsLoad) + IsSafe = IsSafe && (EIInt & (InlineAsm::Extra_MayLoad)); + + auto NewDepOnAlias = (isGlued) ? DepOnAlias : Chain.getOperand(0); + if (IsSafe) { + Chains.push_back(std::make_pair(Chain.getOperand(0), NewDepOnAlias)); + break; + } + } + LLVM_FALLTHROUGH; default: // For all other instructions we will just have to take what we can get. - Aliases.push_back(Chain); + Visited.insert(Chain->getOperand(0).getNode()); + Aliases.push_back(DepOnAlias); break; } } diff --git a/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll --- a/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -9,10 +9,6 @@ define void @test_simple(i32 %n, ...) { ; CHECK-LABEL: test_simple: ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]] - -; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var ; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]] ; ... omit middle ones ... @@ -22,6 +18,10 @@ ; ... omit middle ones ... 
; CHECK: stp q6, q7, [sp, # +; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var + ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] ; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]] @@ -45,10 +45,6 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) { ; CHECK-LABEL: test_fewargs: ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]] - -; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var ; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]] ; ... omit middle ones ... @@ -58,6 +54,10 @@ ; ... omit middle ones ... ; CHECK: str q7, [sp, # +; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var + ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] ; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]] diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-varargs.ll b/llvm/test/CodeGen/Mips/cconv/arguments-varargs.ll --- a/llvm/test/CodeGen/Mips/cconv/arguments-varargs.ll +++ b/llvm/test/CodeGen/Mips/cconv/arguments-varargs.ll @@ -122,12 +122,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i16 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1 store volatile i16 %arg1, i16* %e1, align 2 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i16 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2 store volatile i16 %arg2, i16* %e2, align 2 @@ -237,12 +237,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", 
"~{memory}"() %arg1 = va_arg i8** %ap, i32 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1 store volatile i32 %arg1, i32* %e1, align 4 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i32 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2 store volatile i32 %arg2, i32* %e2, align 4 @@ -359,12 +359,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i64 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1 store volatile i64 %arg1, i64* %e1, align 8 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i64 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2 store volatile i64 %arg2, i64* %e2, align 8 @@ -474,12 +474,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i16 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1 store volatile i16 %arg1, i16* %e1, align 2 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i16 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2 store volatile i16 %arg2, i16* %e2, align 2 @@ -589,12 +589,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i32 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1 store volatile i32 %arg1, i32* %e1, align 4 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() 
%arg2 = va_arg i8** %ap, i32 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2 store volatile i32 %arg2, i32* %e2, align 4 @@ -711,12 +711,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i64 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1 store volatile i64 %arg1, i64* %e1, align 8 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i64 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2 store volatile i64 %arg2, i64* %e2, align 8 @@ -825,12 +825,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i16 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1 store volatile i16 %arg1, i16* %e1, align 2 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i16 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2 store volatile i16 %arg2, i16* %e2, align 2 @@ -939,12 +939,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi $$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i32 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1 store volatile i32 %arg1, i32* %e1, align 4 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i32 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2 store volatile i32 %arg2, i32* %e2, align 4 @@ -1060,12 +1060,12 @@ %ap2 = bitcast i8** %ap to i8* call void @llvm.va_start(i8* %ap2) - call void asm sideeffect "teqi 
$$zero, 1", ""() + call void asm sideeffect "teqi $$zero, 1", "~{memory}"() %arg1 = va_arg i8** %ap, i64 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1 store volatile i64 %arg1, i64* %e1, align 8 - call void asm sideeffect "teqi $$zero, 2", ""() + call void asm sideeffect "teqi $$zero, 2", "~{memory}"() %arg2 = va_arg i8** %ap, i64 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2 store volatile i64 %arg2, i64* %e2, align 8 diff --git a/llvm/test/CodeGen/SystemZ/pr36164.ll b/llvm/test/CodeGen/SystemZ/pr36164.ll --- a/llvm/test/CodeGen/SystemZ/pr36164.ll +++ b/llvm/test/CodeGen/SystemZ/pr36164.ll @@ -17,18 +17,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lhi %r0, 1 ; CHECK-NEXT: larl %r1, g_938 -; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lhi %r2, 3 ; CHECK-NEXT: lhi %r3, 4 ; CHECK-NEXT: larl %r4, g_11 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strl %r0, g_73 -; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll --- a/llvm/test/CodeGen/Thumb/frame-access.ll +++ b/llvm/test/CodeGen/Thumb/frame-access.ll @@ -280,9 +280,9 @@ ; CHECK-NEXT: add r2, sp, #12 ; CHECK-NEXT: bl h ; Load `x`, `y`, and `z` via SP -; CHECK: ldr r1, [sp, #20] -; CHECK-NEXT: ldr r2, [sp, #16] -; CHECK-NEXT: ldr r3, [sp, #12] +; CHECK-DAG: ldr r1, [sp, #20] +; CHECK-DAG: ldr r2, [sp, #16] +; CHECK-DAG: ldr r3, [sp, #12] ; CHECK: bl g ; Re-aligned stack, access via SP. 
@@ -324,9 +324,9 @@ ; CHECK-NEXT: add r2, sp, #20 ; CHECK-NEXT: bl h ; Load `x`, `y`, and `z` via SP for passing to `g` -; CHECK: ldr r1, [sp, #28] -; CHECK-NEXT: ldr r2, [sp, #24] -; CHECK-NEXT: ldr r3, [sp, #20] +; CHECK-DAG: ldr r1, [sp, #28] +; CHECK-DAG: ldr r2, [sp, #24] +; CHECK-DAG: ldr r3, [sp, #20] ; CHECK: bl g ; VLAs, access via BP. diff --git a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll --- a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll +++ b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll @@ -209,17 +209,12 @@ ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: fldt (%eax) -; CHECK-NEXT: flds LCPI10_0 -; CHECK-NEXT: fmul %st, %st(1) -; CHECK-NEXT: flds LCPI10_1 -; CHECK-NEXT: fmul %st, %st(2) -; CHECK-NEXT: fxch %st(2) +; CHECK-NEXT: fmuls LCPI10_0 +; CHECK-NEXT: fmuls LCPI10_1 +; CHECK-NEXT: fld %st(0) ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: fistpl %st ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: fldt (%eax) -; CHECK-NEXT: fmulp %st, %st(1) -; CHECK-NEXT: fmulp %st, %st(1) ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: fistpl %st ; CHECK-NEXT: ## InlineAsm End diff --git a/llvm/test/CodeGen/X86/pr9517.ll b/llvm/test/CodeGen/X86/pr9517.ll --- a/llvm/test/CodeGen/X86/pr9517.ll +++ b/llvm/test/CodeGen/X86/pr9517.ll @@ -11,7 +11,6 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq @@ -22,7 +21,7 @@ ret i16 %v } -; The asm call prevents the merging the loads here. +; The asm call prevents the merging of the loads here.
define i16 @unify_through_trival_asm_w_memory_clobber() { ; CHECK-LABEL: unify_through_trival_asm_w_memory_clobber: ; CHECK: # %bb.0: @@ -47,61 +46,37 @@ ; CHECK-NEXT: movzwl {{.*}}(%rip), %edx ; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $2, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $3, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $4, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $5, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $6, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; CHECK-NEXT: movb $7, %al -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP ; CHECK-NEXT: outb %al, %dx ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movzwl {{.*}}(%rip), %edx -; CHECK-NEXT: addl $16, %edx ; 
CHECK-NEXT: movb $8, %al ; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: #APP diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -460,10 +460,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -460,10 +460,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -460,10 +460,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -460,10 
+460,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: