Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp =================================================================== --- llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -338,9 +338,9 @@ // Perform the following transformation. // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx // => - // [E|R]BX = InArg + // RBX = InArg // actualcmpxchg Addr - // [E|R]BX = SaveRbx + // RBX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); Register SaveRbx = MBBI->getOperand(7).getReg(); Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -756,7 +756,6 @@ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, - LCMPXCHG8_SAVE_EBX_DAG, LCMPXCHG16_SAVE_RBX_DAG, /// LOCK-prefixed arithmetic read-modify-write instructions. Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30481,38 +30481,30 @@ swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); - // If the current function needs the base pointer, RBX, - // we shouldn't use cmpxchg directly. - // Indeed the lowering of that instruction will clobber - // that register and since RBX will be a reserved register - // the register allocator will not make sure its value will - // be properly saved and restored around this live-range. - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + + // In 64-bit mode we might need the base pointer in RBX, but we can't know + // until later. So we keep the RBX input in a vreg and use a custom + // inserter. 
+ // Since RBX will be a reserved register the register allocator will not + // make sure its value will be properly saved and restored around this + // live-range. SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - Register BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); - if (TRI->hasBasePointer(DAG.getMachineFunction()) && - (BasePtr == X86::RBX || BasePtr == X86::EBX)) { - assert(Regs64bit && "RBX/EBX base pointer only expected for i128 CAS"); - SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, - X86::RBX, - HalfT, swapInH.getValue(1)); - SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, - RBXSave, - /*Glue*/ RBXSave.getValue(2)}; - Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_SAVE_RBX_DAG, dl, Tys, - Ops, T, MMO); + if (Regs64bit) { + SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, + swapInH.getValue(1)}; + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); } else { - unsigned Opcode = - Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, - Regs64bit ? X86::RBX : X86::EBX, swapInL, + swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; - Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); } + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? 
X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -30811,7 +30803,6 @@ NODE_NAME_CASE(LCMPXCHG_DAG) NODE_NAME_CASE(LCMPXCHG8_DAG) NODE_NAME_CASE(LCMPXCHG16_DAG) - NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG) NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) NODE_NAME_CASE(LADD) NODE_NAME_CASE(LSUB) @@ -33770,11 +33761,32 @@ return BB; } - case X86::LCMPXCHG16B: - return BB; - case X86::LCMPXCHG16B_SAVE_RBX: { - if (!BB->isLiveIn(X86::RBX)) - BB->addLiveIn(X86::RBX); + case X86::LCMPXCHG16B_NO_RBX: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + Register BasePtr = TRI->getBaseRegister(); + if (TRI->hasBasePointer(*MF) && + (BasePtr == X86::RBX || BasePtr == X86::EBX)) { + if (!BB->isLiveIn(BasePtr)) + BB->addLiveIn(BasePtr); + // Save RBX into a virtual register. + Register SaveRBX = + MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + .addReg(X86::RBX); + Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + X86AddressMode AM = getAddressFromInstr(&MI, 0); + addFullAddress( + BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst), AM) + .add(MI.getOperand(X86::AddrNumOperands)) + .addReg(SaveRBX); + } else { + // Simple case, just copy the virtual register to RBX. + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) + .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); + X86AddressMode AM = getAddressFromInstr(&MI, 0); + addFullAddress(BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)), AM); + } + MI.eraseFromParent(); return BB; } case X86::MWAITX: { Index: llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -809,15 +809,6 @@ } // Atomic compare and swap. 
-multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, - SDPatternOperator frag, X86MemOperand x86memop> { -let isCodeGenOnly = 1, usesCustomInserter = 1 in { - def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), - !strconcat(mnemonic, "\t$ptr"), - [(frag addr:$ptr)]>, TB, LOCK; -} -} - multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { @@ -841,14 +832,19 @@ } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in { -defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, usesCustomInserter = 1 in { +def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), + "cmpxchg8b\t$ptr", + [(X86cas8 addr:$ptr)]>, TB, LOCK; } let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in { -defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", - X86cas16, i128mem>, REX_W; + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { +def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), + "cmpxchg16b\t$ptr", + []>, TB, LOCK; } // This pseudo must be used when the frame uses RBX as @@ -872,14 +868,28 @@ // the value of RBX. 
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", - usesCustomInserter = 1 in { + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, + Constraints = "$rbx_save = $dst" in { def LCMPXCHG16B_SAVE_RBX : I<0, Pseudo, (outs GR64:$dst), (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), - !strconcat("cmpxchg16b", "\t$ptr"), - [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input, - GR64:$rbx_save))]>; + "cmpxchg16b\t$ptr", + []>; +} + +// Pseudo instruction that doesn't read/write RBX. Will be turned into either +// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, + usesCustomInserter = 1 in { +def LCMPXCHG16B_NO_RBX : + I<0, Pseudo, (outs), + (ins i128mem:$ptr, GR64:$rbx_input), + "cmpxchg16b\t$ptr", + [(X86cas16 addr:$ptr, GR64:$rbx_input)]>; } // This pseudo must be used when the frame uses RBX/EBX as Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -69,10 +69,8 @@ def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; -def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; -def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, - [SDTCisVT<0, i64>, SDTCisPtrTy<1>, - SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; +def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>; def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -171,16 +169,12 @@ def X86cas : 
SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", - SDTX86caspairSaveRbx16, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; Index: llvm/test/CodeGen/X86/pr42064.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr42064.ll @@ -0,0 +1,204 @@ +; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc19.11.0 -mattr=+avx,+cx16 | FileCheck %s + +%struct.TestStruct = type { %union.Int128 } +%union.Int128 = type { i128 } +%struct.SomeArrays = type { %struct.SillyArray, %struct.SillyArray, %struct.SillyArray } +%struct.SillyArray = type { i8*, i32, i32 } + +declare void @llvm.lifetime.start.p0i8(i64, i8*) + +define void @foo(%struct.TestStruct* %0) align 2 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +; CHECK-LABEL: foo: +; CHECK: movq %rbx, %r9 +; CHECK-NEXT: movabsq $1393743441367457520, %rcx # imm = 0x135792468ABCDEF0 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: lock cmpxchg16b (%r8) +; CHECK-NEXT: movq %r9, %rbx + +; CHECK: movq %rbx, %r9 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: lock cmpxchg16b (%r8) +; CHECK-NEXT: movq %r9, %rbx + %2 = alloca %struct.SomeArrays, align 8 + %3 
= alloca %struct.SomeArrays, align 8 + %4 = getelementptr inbounds %struct.TestStruct, %struct.TestStruct* %0, i64 0, i32 0, i32 0 + %5 = cmpxchg i128* %4, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst + %6 = extractvalue { i128, i1 } %5, 0 + %7 = trunc i128 %6 to i64 + %8 = icmp eq i64 %7, 0 + br i1 %8, label %11, label %9 + +9: ; preds = %1 + %10 = cmpxchg i128* %4, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst + br label %11 + +11: ; preds = %9, %1 + %12 = bitcast %struct.SomeArrays* %2 to i8* + call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %12) + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %12, i8 0, i64 48, i1 false) + %13 = bitcast %struct.SomeArrays* %3 to i8* + call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %13) + %14 = bitcast %struct.SomeArrays* %3 to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %14, i8 0, i64 48, i1 false) + %15 = invoke nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays* nonnull %2, %struct.SomeArrays* nonnull align 8 dereferenceable(48) %3) + to label %16 unwind label %47 + +16: ; preds = %11 + %17 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 2, i32 0 + %18 = load i8*, i8** %17, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %18) + to label %21 unwind label %19 + +19: ; preds = %16 + %20 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %20) ] + unreachable + +21: ; preds = %16 + %22 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 1, i32 0 + %23 = load i8*, i8** %22, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %23) + to label %26 unwind label %24 + +24: ; preds = %21 + %25 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %25) ] + unreachable + +26: ; preds = 
%21 + %27 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 0, i32 0 + %28 = load i8*, i8** %27, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %28) + to label %31 unwind label %29 + +29: ; preds = %26 + %30 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %30) ] + unreachable + +31: ; preds = %26 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %13) + %32 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 2, i32 0 + %33 = load i8*, i8** %32, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %33) + to label %36 unwind label %34 + +34: ; preds = %31 + %35 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %35) ] + unreachable + +36: ; preds = %31 + %37 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 1, i32 0 + %38 = load i8*, i8** %37, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %38) + to label %41 unwind label %39 + +39: ; preds = %36 + %40 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %40) ] + unreachable + +41: ; preds = %36 + %42 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 0, i32 0 + %43 = load i8*, i8** %42, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %43) + to label %46 unwind label %44 + +44: ; preds = %41 + %45 = cleanuppad within none [] + call void @__std_terminate() [ "funclet"(token %45) ] + unreachable + +46: ; preds = %41 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %12) + ret void + +47: ; preds = %11 + %48 = cleanuppad within none [] + %49 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 2, i32 0 + %50 = load i8*, i8** %49, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %50) [ "funclet"(token %48) ] + to label %53 unwind label %51 + +51: ; preds = %47 + %52 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %52) ] + unreachable + +53: ; preds = %47 + %54 = getelementptr 
inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 1, i32 0 + %55 = load i8*, i8** %54, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %55) [ "funclet"(token %48) ] + to label %58 unwind label %56 + +56: ; preds = %53 + %57 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %57) ] + unreachable + +58: ; preds = %53 + %59 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %3, i64 0, i32 0, i32 0 + %60 = load i8*, i8** %59, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %60) [ "funclet"(token %48) ] + to label %63 unwind label %61 + +61: ; preds = %58 + %62 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %62) ] + unreachable + +63: ; preds = %58 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %13) + %64 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 2, i32 0 + %65 = load i8*, i8** %64, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %65) [ "funclet"(token %48) ] + to label %68 unwind label %66 + +66: ; preds = %63 + %67 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %67) ] + unreachable + +68: ; preds = %63 + %69 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 1, i32 0 + %70 = load i8*, i8** %69, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %70) [ "funclet"(token %48) ] + to label %73 unwind label %71 + +71: ; preds = %68 + %72 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %72) ] + unreachable + +73: ; preds = %68 + %74 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %2, i64 0, i32 0, i32 0 + %75 = load i8*, i8** %74, align 8 + invoke void @"?free@@YAXPEAX@Z"(i8* %75) [ "funclet"(token %48) ] + to label %78 unwind label %76 + +76: ; preds = %73 + %77 = cleanuppad within %48 [] + call void @__std_terminate() [ "funclet"(token %77) ] + unreachable + +78: ; preds = %73 + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %12) + cleanupret from 
%48 unwind to caller +} + +declare void @llvm.lifetime.end.p0i8(i64, i8*) + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1) + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays*, %struct.SomeArrays* nonnull align 8 dereferenceable(48)) align 2 + +declare void @"?free@@YAXPEAX@Z"(i8*) + +declare void @__std_terminate()