diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -101,6 +101,10 @@ // Value was lowered to tied def and gc.relocate should be replaced with // copy from vreg. VReg, + // Value was lowered to tied def and gc.relocate should be replaced with + // SDValue kept in StatepointLoweringState structure. This is valid for + // local relocates only. + SDValueNode, } type = NoRelocate; // Payload contains either frame index of the stack slot in which the value // was spilled, or virtual register which contains the re-definition. diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -884,8 +884,9 @@ DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops); DAG.setNodeMemRefs(StatepointMCNode, MemRefs); - // For values lowered to tied-defs, create the virtual registers. Note that - // for simplicity, we *always* create a vreg even within a single block. + // For values lowered to tied-defs, create the virtual registers if used + // in other blocks. For a local gc.relocate, record the appropriate + // statepoint result in StatepointLoweringState. DenseMap VirtRegs; for (const auto *Relocate : SI.GCRelocates) { Value *Derived = Relocate->getDerivedPtr(); @@ -893,12 +894,23 @@ if (!LowerAsVReg.count(SD)) continue; + SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); + + // Handle local relocate. Note that different relocates might + // map to the same SDValue.
+ if (SI.StatepointInstr->getParent() == Relocate->getParent()) { + SDValue Res = StatepointLowering.getLocation(SD); + if (Res) + assert(Res == Relocated); + else + StatepointLowering.setLocation(SD, Relocated); + continue; + } + // Handle multiple gc.relocates of the same input efficiently. if (VirtRegs.count(SD)) continue; - SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); - auto *RetTy = Relocate->getType(); Register Reg = FuncInfo.CreateRegs(RetTy); RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), @@ -919,8 +931,13 @@ SDValue SDV = getValue(V); SDValue Loc = StatepointLowering.getLocation(SDV); + bool IsLocal = (Relocate->getParent() == StatepointInstr->getParent()); + RecordType Record; - if (LowerAsVReg.count(SDV)) { + if (IsLocal && LowerAsVReg.count(SDV)) { + // Result is already stored in StatepointLowering + Record.type = RecordType::SDValueNode; + } else if (LowerAsVReg.count(SDV)) { Record.type = RecordType::VReg; assert(VirtRegs.count(SDV)); Record.payload.Reg = VirtRegs[SDV]; @@ -1219,6 +1236,14 @@ const RecordType &Record = SlotIt->second; // If relocation was done via virtual register.. 
+ if (Record.type == RecordType::SDValueNode) { + assert(Relocate.getStatepoint()->getParent() == Relocate.getParent() && + "Nonlocal gc.relocate mapped via SDValue"); + SDValue SDV = StatepointLowering.getLocation(getValue(DerivedPtr)); + assert(SDV.getNode() && "empty SDValue"); + setValue(&Relocate, SDV); + return; + } if (Record.type == RecordType::VReg) { Register InReg = Record.payload.Reg; RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll @@ -322,14 +322,14 @@ ;CHECK-VREG: %1:gr64 = COPY $rsi ;CHECK-VREG: %0:gr64 = COPY $rdi ;CHECK-VREG: TEST32rr %2, %2, implicit-def $eflags -;CHECK-VREG: %5:gr64 = CMOV64rr %1, %0, 4, implicit $eflags -;CHECK-VREG: %6:gr32 = MOV32r0 implicit-def dead $eflags -;CHECK-VREG: %7:gr64 = SUBREG_TO_REG 0, killed %6, %subreg.sub_32bit -;CHECK-VREG: $rdi = COPY %7 -;CHECK-VREG: $rsi = COPY %5 -;CHECK-VREG: %3:gr64, %4:gr64 = STATEPOINT 10, 0, 2, @bar, $rdi, $rsi, 2, 0, 2, 0, 2, 0, 2, 2, %1(tied-def 0), %0(tied-def 1), 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp +;CHECK-VREG: %3:gr64 = CMOV64rr %1, %0, 4, implicit $eflags +;CHECK-VREG: %4:gr32 = MOV32r0 implicit-def dead $eflags +;CHECK-VREG: %5:gr64 = SUBREG_TO_REG 0, killed %4, %subreg.sub_32bit +;CHECK-VREG: $rdi = COPY %5 +;CHECK-VREG: $rsi = COPY %3 +;CHECK-VREG: %6:gr64, %7:gr64 = STATEPOINT 10, 0, 2, @bar, $rdi, $rsi, 2, 0, 2, 0, 2, 0, 2, 2, %1(tied-def 0), %0(tied-def 1), 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp ;CHECK-VREG: TEST32rr %2, %2, implicit-def $eflags -;CHECK-VREG: %8:gr64 = CMOV64rr %3, %4, 4, implicit $eflags +;CHECK-VREG: %8:gr64 = CMOV64rr %6, killed %7, 4, implicit $eflags ;CHECK-VREG: $rax = COPY %8 ;CHECK-VREG: RET 0, $rax entry: @@ -342,14 +342,14 @@ ret i8 addrspace(1)* 
%res } -; Show that ISEL of gc.relocate used in other BB does generate extra COPY instruction. +; Check that ISEL of gc.relocate used in other BB does not generate extra COPY instruction. define i1 @test_cross_bb_reloc(i32 addrspace(1)* %a, i1 %external_cond) gc "statepoint-example" { ; CHECK-VREG_LABEL: test_cross_bb_reloc: ; CHECK-VREG: bb.0.entry: ; CHECK-VREG: [[VREG:%[^ ]+]]:gr64 = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, 2, 1, %2(tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al -; CHECK-VREG: [[EXTRA:%[^ ]+]]:gr64 = COPY [[VREG]] +; CHECK-VREG-NOT: COPY [[VREG]] ; CHECK-VREG: bb.1.left: -; CHECK-VREG: $rdi = COPY [[EXTRA]] +; CHECK-VREG: $rdi = COPY [[VREG]] ; CHECK-VREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp ; CHECK-VREG: $al = COPY %1 ; CHECK-VREG: RET 0, $al diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll @@ -12,43 +12,43 @@ i32 addrspace(1)* %arg12, i32 addrspace(1)* %arg13, i32 addrspace(1)* %arg14, i32 addrspace(1)* %arg15, i32 addrspace(1)* %arg16, i32 addrspace(1)* %arg17 ) gc "statepoint-example" { ; CHECK-VREG-LABEL: test_spill -; CHECK-VREG: %18:gr64 = COPY $r9 -; CHECK-VREG: %19:gr64 = COPY $r8 -; CHECK-VREG: %20:gr64 = COPY $rcx -; CHECK-VREG: %21:gr64 = COPY $rdx -; CHECK-VREG: %22:gr64 = COPY $rsi -; CHECK-VREG: %23:gr64 = COPY $rdi -; CHECK-VREG: %17:gr64 = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16) -; CHECK-VREG: %16:gr64 = MOV64rm %fixed-stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.10) -; CHECK-VREG: %15:gr64 = MOV64rm %fixed-stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.9, align 16) -; CHECK-VREG: %14:gr64 
= MOV64rm %fixed-stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.8) -; CHECK-VREG: %13:gr64 = MOV64rm %fixed-stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.7, align 16) -; CHECK-VREG: %12:gr64 = MOV64rm %fixed-stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.6) -; CHECK-VREG: %11:gr64 = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16) -; CHECK-VREG: %10:gr64 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) -; CHECK-VREG: %9:gr64 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) -; CHECK-VREG: %8:gr64 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) -; CHECK-VREG: %7:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) -; CHECK-VREG: %6:gr64 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) -; CHECK-VREG: %6:gr64, %7:gr64, %8:gr64, %9:gr64, %10:gr64, %11:gr64, %12:gr64, %13:gr64, %14:gr64, %15:gr64, %16:gr64, %17:gr64, %18:gr64, %19:gr64, %20:gr64, %21:gr64, %22:gr64, %23:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, %6(tied-def 0), %7(tied-def 1), %8(tied-def 2), %9(tied-def 3), %10(tied-def 4), %11(tied-def 5), %12(tied-def 6), %13(tied-def 7), %14(tied-def 8), %15(tied-def 9), %16(tied-def 10), %17(tied-def 11), %18(tied-def 12), %19(tied-def 13), %20(tied-def 14), %21(tied-def 15), %22(tied-def 16), %23(tied-def 17), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp -; CHECK-VREG: %38:gr32 = MOV32rm %23, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %22, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %21, 1, 
$noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %20, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %19, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %18, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %17, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %16, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %15, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %14, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %13, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %12, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %11, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %10, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %9, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %8, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %7, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, 
addrspace 1) -; CHECK-VREG: %38:gr32 = ADD32rm %38, %6, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) +; CHECK-VREG: %30:gr64 = COPY $r9 +; CHECK-VREG: %31:gr64 = COPY $r8 +; CHECK-VREG: %32:gr64 = COPY $rcx +; CHECK-VREG: %33:gr64 = COPY $rdx +; CHECK-VREG: %34:gr64 = COPY $rsi +; CHECK-VREG: %35:gr64 = COPY $rdi +; CHECK-VREG: %29:gr64 = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16) +; CHECK-VREG: %28:gr64 = MOV64rm %fixed-stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.10) +; CHECK-VREG: %27:gr64 = MOV64rm %fixed-stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.9, align 16) +; CHECK-VREG: %26:gr64 = MOV64rm %fixed-stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.8) +; CHECK-VREG: %25:gr64 = MOV64rm %fixed-stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.7, align 16) +; CHECK-VREG: %24:gr64 = MOV64rm %fixed-stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.6) +; CHECK-VREG: %23:gr64 = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16) +; CHECK-VREG: %22:gr64 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) +; CHECK-VREG: %21:gr64 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) +; CHECK-VREG: %20:gr64 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) +; CHECK-VREG: %19:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) +; CHECK-VREG: %18:gr64 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) +; CHECK-VREG: %18:gr64, %19:gr64, %20:gr64, %21:gr64, %22:gr64, %23:gr64, %24:gr64, %25:gr64, %26:gr64, %27:gr64, %28:gr64, %29:gr64, %30:gr64, %31:gr64, %32:gr64, %33:gr64, %34:gr64, %35:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, %18(tied-def 0), %19(tied-def 1), 
%20(tied-def 2), %21(tied-def 3), %22(tied-def 4), %23(tied-def 5), %24(tied-def 6), %25(tied-def 7), %26(tied-def 8), %27(tied-def 9), %28(tied-def 10), %29(tied-def 11), %30(tied-def 12), %31(tied-def 13), %32(tied-def 14), %33(tied-def 15), %34(tied-def 16), %35(tied-def 17), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: %38:gr32 = MOV32rm %35, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %34, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %33, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %32, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %31, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %30, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %29, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %28, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %27, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %26, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %25, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %24, 1, $noreg, 48, 
$noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %23, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %22, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %21, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %20, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %19, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) +; CHECK-VREG: %38:gr32 = ADD32rm %38, %18, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) ; CHECK-VREG: $eax = COPY %38 ; CHECK-PREG: renamable $rbx = COPY $r9 diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.ll b/llvm/test/CodeGen/X86/statepoint-vreg.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg.ll @@ -434,20 +434,20 @@ ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: nopl 8(%rax,%rax) ; CHECK-NEXT: .Ltmp14: -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss %xmm0, (%rsp) -; CHECK-NEXT: nopl 8(%rax,%rax) -; CHECK-NEXT: .Ltmp15: +; CHECK-NEXT: movss %xmm0, {{[-0-9]*}}(%rsp) ; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: nopl 8(%rax,%rax) +; CHECK-NEXT: .Ltmp15: ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 
= mem[0],zero,zero,zero ; CHECK-NEXT: movss %xmm0, (%rsp) +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero +; CHECK-NEXT: movsd %xmm1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: nopl 8(%rax,%rax) ; CHECK-NEXT: .Ltmp16: ; CHECK-NEXT: xorl %eax, %eax