diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -338,9 +338,9 @@ // Perform the following transformation. // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx // => - // [E|R]BX = InArg + // RBX = InArg // actualcmpxchg Addr - // [E|R]BX = SaveRbx + // RBX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); Register SaveRbx = MBBI->getOperand(7).getReg(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -756,7 +756,6 @@ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, - LCMPXCHG8_SAVE_EBX_DAG, LCMPXCHG16_SAVE_RBX_DAG, /// LOCK-prefixed arithmetic read-modify-write instructions. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30481,38 +30481,30 @@ swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); - // If the current function needs the base pointer, RBX, - // we shouldn't use cmpxchg directly. - // Indeed the lowering of that instruction will clobber - // that register and since RBX will be a reserved register - // the register allocator will not make sure its value will - // be properly saved and restored around this live-range. - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + + // In 64-bit mode we might need the base pointer in RBX, but we can't know + // until later. So we keep the RBX input in a vreg and use a custom + // inserter. + // Since RBX will be a reserved register the register allocator will not + // make sure its value will be properly saved and restored around this + // live-range. SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - Register BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); - if (TRI->hasBasePointer(DAG.getMachineFunction()) && - (BasePtr == X86::RBX || BasePtr == X86::EBX)) { - assert(Regs64bit && "RBX/EBX base pointer only expected for i128 CAS"); - SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, - X86::RBX, - HalfT, swapInH.getValue(1)); - SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, - RBXSave, - /*Glue*/ RBXSave.getValue(2)}; - Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_SAVE_RBX_DAG, dl, Tys, - Ops, T, MMO); + if (Regs64bit) { + SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, + swapInH.getValue(1)}; + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); } else { - unsigned Opcode = - Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, - Regs64bit ? X86::RBX : X86::EBX, swapInL, + swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; - Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); } + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -30811,7 +30803,6 @@ NODE_NAME_CASE(LCMPXCHG_DAG) NODE_NAME_CASE(LCMPXCHG8_DAG) NODE_NAME_CASE(LCMPXCHG16_DAG) - NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG) NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) NODE_NAME_CASE(LADD) NODE_NAME_CASE(LSUB) @@ -33770,11 +33761,31 @@ return BB; } - case X86::LCMPXCHG16B: - return BB; - case X86::LCMPXCHG16B_SAVE_RBX: { - if (!BB->isLiveIn(X86::RBX)) - BB->addLiveIn(X86::RBX); + case X86::LCMPXCHG16B_NO_RBX: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + Register BasePtr = TRI->getBaseRegister(); + X86AddressMode AM = getAddressFromInstr(&MI, 0); + if (TRI->hasBasePointer(*MF) && + (BasePtr == X86::RBX || BasePtr == X86::EBX)) { + if (!BB->isLiveIn(BasePtr)) + BB->addLiveIn(BasePtr); + // Save RBX into a virtual register. + Register SaveRBX = + MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + .addReg(X86::RBX); + Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + addFullAddress( + BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst), AM) + .add(MI.getOperand(X86::AddrNumOperands)) + .addReg(SaveRBX); + } else { + // Simple case, just copy the virtual register to RBX. + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) + .add(MI.getOperand(X86::AddrNumOperands)); + addFullAddress(BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)), AM); + } + MI.eraseFromParent(); return BB; } case X86::MWAITX: { diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -809,15 +809,6 @@ } // Atomic compare and swap. -multiclass LCMPXCHG_UnOp Opc, Format Form, string mnemonic, - SDPatternOperator frag, X86MemOperand x86memop> { -let isCodeGenOnly = 1, usesCustomInserter = 1 in { - def NAME : I, TB, LOCK; -} -} - multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { @@ -841,14 +832,19 @@ } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in { -defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, usesCustomInserter = 1 in { +def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), + "cmpxchg8b\t$ptr", + [(X86cas8 addr:$ptr)]>, TB, LOCK; } let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in { -defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", - X86cas16, i128mem>, REX_W; + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { +def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), + "cmpxchg16b\t$ptr", + []>, TB, LOCK; } // This pseudo must be used when the frame uses RBX as @@ -872,14 +868,24 @@ // the value of RBX. let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", - usesCustomInserter = 1 in { + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, + Constraints = "$rbx_save = $dst" in { def LCMPXCHG16B_SAVE_RBX : I<0, Pseudo, (outs GR64:$dst), - (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), - !strconcat("cmpxchg16b", "\t$ptr"), - [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input, - GR64:$rbx_save))]>; + (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>; +} + +// Pseudo instruction that doesn't read/write RBX. Will be turned into either +// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, + usesCustomInserter = 1 in { +def LCMPXCHG16B_NO_RBX : + I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "", + [(X86cas16 addr:$ptr, GR64:$rbx_input)]>; } // This pseudo must be used when the frame uses RBX/EBX as diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -69,10 +69,8 @@ def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; -def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; -def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, - [SDTCisVT<0, i64>, SDTCisPtrTy<1>, - SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; +def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>; def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -171,16 +169,12 @@ def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", - SDTX86caspairSaveRbx16, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/llvm/test/CodeGen/X86/pr42064.ll b/llvm/test/CodeGen/X86/pr42064.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr42064.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc19.11.0 -mattr=+avx,+cx16 | FileCheck %s + +%struct.TestStruct = type { %union.Int128 } +%union.Int128 = type { i128 } +%struct.SomeArrays = type { %struct.SillyArray, %struct.SillyArray, %struct.SillyArray } +%struct.SillyArray = type { i8*, i32, i32 } + +declare void @llvm.lifetime.start.p0i8(i64, i8*) + +define void @foo(%struct.TestStruct* %arg) align 2 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +; Check that %rbx is being used for a frame pointer +; CHECK-LABEL: foo: +; CHECK: movq %rsp, %rbx + +; Check that %rbx is saved and restored around both lock cmpxchg16b. +; CHECK: movq %rbx, %r9 +; CHECK-NEXT: movabsq $1393743441367457520, %rcx # imm = 0x135792468ABCDEF0 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: lock cmpxchg16b (%r8) +; CHECK-NEXT: movq %r9, %rbx + +; CHECK: movq %rbx, %r9 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: lock cmpxchg16b (%r8) +; CHECK-NEXT: movq %r9, %rbx +bb: + %i = alloca %struct.SomeArrays, align 8 + %i1 = alloca %struct.SomeArrays, align 8 + %i2 = getelementptr inbounds %struct.TestStruct, %struct.TestStruct* %arg, i64 0, i32 0, i32 0 + %i3 = cmpxchg i128* %i2, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst + %i4 = extractvalue { i128, i1 } %i3, 0 + %i5 = trunc i128 %i4 to i64 + %i6 = icmp eq i64 %i5, 0 + br i1 %i6, label %bb9, label %bb7 + +bb7: ; preds = %bb + %i8 = cmpxchg i128* %i2, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst + br label %bb9 + +bb9: ; preds = %bb7, %bb + %i10 = bitcast %struct.SomeArrays* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %i10) + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %i10, i8 0, i64 48, i1 false) + %i11 = bitcast %struct.SomeArrays* %i1 to i8* + call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %i11) + %i12 = bitcast %struct.SomeArrays* %i1 to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %i12, i8 0, i64 48, i1 false) + %i13 = invoke nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays* nonnull %i, %struct.SomeArrays* nonnull align 8 dereferenceable(48) %i1) + to label %bb14 unwind label %bb45 + +bb14: ; preds = %bb9 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %i10) + ret void + +bb45: ; preds = %bb9 + %i46 = cleanuppad within none [] + %i47 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %i1, i64 0, i32 2, i32 0 + %i48 = load i8*, i8** %i47, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %i48) [ "funclet"(token %i46) ] + to label %bb51 unwind label %bb49 + +bb49: ; preds = %bb45 + %i50 = cleanuppad within %i46 [] + call void @__std_terminate() [ "funclet"(token %i50) ] + unreachable + +bb51: ; preds = %bb45 + %i52 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %i1, i64 0, i32 1, i32 0 + %i53 = load i8*, i8** %i52, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %i53) [ "funclet"(token %i46) ] + to label %bb56 unwind label %bb54 + +bb54: ; preds = %bb51 + %i55 = cleanuppad within %i46 [] + call void @__std_terminate() [ "funclet"(token %i55) ] + unreachable + +bb56: ; preds = %bb51 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %i10) + cleanupret from %i46 unwind to caller +} + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1) + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays*, %struct.SomeArrays* nonnull align 8 dereferenceable(48)) align 2 + +declare void @"?free@@YAXPEAX@Z"(i8*) + +declare void @__std_terminate()