Index: include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- include/llvm/CodeGen/TargetInstrInfo.h
+++ include/llvm/CodeGen/TargetInstrInfo.h
@@ -225,6 +225,17 @@
     return 0;
   }
 
+  /// Optional extension of isLoadFromStackSlot that returns the number of
+  /// bytes loaded from the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the load does.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                                       int &FrameIndex,
+                                       unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isLoadFromStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic so it isn't reliable for correctness.
   virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -252,6 +263,17 @@
     return 0;
   }
 
+  /// Optional extension of isStoreToStackSlot that returns the number of
+  /// bytes stored to the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the store does.
+  virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
+                                      int &FrameIndex,
+                                      unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isStoreToStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic, so it isn't reliable for correctness.
   virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
Index: lib/CodeGen/StackSlotColoring.cpp
===================================================================
--- lib/CodeGen/StackSlotColoring.cpp
+++ lib/CodeGen/StackSlotColoring.cpp
@@ -418,7 +418,9 @@
       unsigned LoadReg = 0;
       unsigned StoreReg = 0;
-      if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+      unsigned LoadSize = 0;
+      unsigned StoreSize = 0;
+      if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
         continue;
       // Skip the ...pseudo debugging... instructions between a load and store.
       while ((NextMI != E) && NextMI->isDebugValue()) {
@@ -426,9 +428,11 @@
         ++I;
       }
       if (NextMI == E) continue;
-      if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+      if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+        continue;
+      if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+          LoadSize != StoreSize)
         continue;
-      if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
       ++NumDead;
       changed = true;
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -232,6 +232,9 @@
   unsigned isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex,
+                               unsigned &MemBytes) const override;
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
@@ -240,6 +243,9 @@
   unsigned isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex,
+                              unsigned &MemBytes) const override;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
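A note on the new TargetInstrInfo hooks: the default implementations report MemBytes = 0 and forward to the existing two-argument forms, so targets that never emit partial spills or reloads need no changes, and for them LoadSize == StoreSize still holds trivially in StackSlotColoring, preserving the old behavior. A target that does emit partial accesses overrides the three-argument form and reports the real access size. A minimal sketch of such an override; MyTarget and its LD32fi/LD64fi opcodes are hypothetical and not part of this patch:

  // Sketch only: MyTarget and its opcodes are invented for illustration.
  unsigned MyTargetInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                                  int &FrameIndex,
                                                  unsigned &MemBytes) const {
    switch (MI.getOpcode()) {
    default:
      MemBytes = 0;
      return 0;              // Not a stack reload at all.
    case MyTarget::LD32fi:   // Hypothetical 4-byte reload.
      MemBytes = 4;
      break;
    case MyTarget::LD64fi:   // Hypothetical 8-byte reload.
      MemBytes = 8;
      break;
    }
    // By assumption, operand 0 is the destination and operand 1 the
    // frame index for these opcodes.
    if (!MI.getOperand(1).isFI())
      return 0;
    FrameIndex = MI.getOperand(1).getIndex();
    return MI.getOperand(0).getReg();
  }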
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3946,17 +3946,21 @@
   case X86::MOV8rm:
   case X86::MOV16rm:
   case X86::MOV32rm:
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
   case X86::MOV64rm:
   case X86::LD_Fp64m:
-  case X86::MOVSSrm:
   case X86::MOVSDrm:
+  case X86::VMOVSDZrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
   case X86::MOVAPDrm:
   case X86::MOVUPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::VMOVSSrm:
   case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPSrm:
@@ -3970,10 +3974,93 @@
   case X86::VMOVAPDYrm:
   case X86::VMOVDQUYrm:
   case X86::VMOVDQAYrm:
-  case X86::MMX_MOVD64rm:
-  case X86::MMX_MOVQ64rm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU64Zrm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::KMOVBkm:
+  case X86::KMOVWkm:
+  case X86::KMOVDkm:
+  case X86::KMOVQkm:
+    return true;
+  }
+}
+
+// Is frame load, but with load size information
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes,
+                              unsigned RegSize) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8rm:
+    MemBytes = 1;
+    return true;
+  case X86::MOV16rm:
+    MemBytes = 2;
+    return true;
+  case X86::MOV32rm:
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+    MemBytes = 4;
+    return true;
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    MemBytes = 8;
+    return true;
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQUYrm:
+  case X86::VMOVDQAYrm:
   case X86::VMOVAPSZrm:
   case X86::VMOVAPSZ128rm:
   case X86::VMOVAPSZ256rm:
@@ -4012,28 +4099,39 @@
   case X86::KMOVWkm:
   case X86::KMOVDkm:
   case X86::KMOVQkm:
+    MemBytes = RegSize;
     return true;
   }
 }
 
 static bool isFrameStoreOpcode(int Opcode) {
   switch (Opcode) {
-  default: break;
+  default:
+    return false;
   case X86::MOV8mr:
+  case X86::KMOVBmk:
   case X86::MOV16mr:
+  case X86::KMOVWmk:
   case X86::MOV32mr:
+  case X86::MOVSSmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSSZmr:
+  case X86::KMOVDmk:
   case X86::MOV64mr:
   case X86::ST_FpP64m:
-  case X86::MOVSSmr:
   case X86::MOVSDmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVSDZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVQmk:
   case X86::MOVAPSmr:
   case X86::MOVUPSmr:
   case X86::MOVAPDmr:
   case X86::MOVUPDmr:
   case X86::MOVDQAmr:
   case X86::MOVDQUmr:
-  case X86::VMOVSSmr:
-  case X86::VMOVSDmr:
   case X86::VMOVAPSmr:
   case X86::VMOVUPSmr:
   case X86::VMOVAPDmr:
@@ -4046,8 +4144,6 @@
   case X86::VMOVAPDYmr:
   case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
-  case X86::VMOVSSZmr:
-  case X86::VMOVSDZmr:
   case X86::VMOVUPSZmr:
   case X86::VMOVUPSZ128mr:
   case X86::VMOVUPSZ256mr:
@@ -4082,13 +4178,96 @@
   case X86::VMOVDQU16Zmr:
   case X86::VMOVDQU16Z128mr:
   case X86::VMOVDQU16Z256mr:
-  case X86::MMX_MOVD64mr:
-  case X86::MMX_MOVQ64mr:
-  case X86::MMX_MOVNTQmr:
+    return true;
+  }
+  return false;
+}
+
+// Is frame store, but with store size information
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes,
+                               unsigned RegSize) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8mr:
+  case X86::KMOVBmk:
+    MemBytes = 1;
+    return true;
+  case X86::MOV16mr:
+  case X86::KMOVWmk:
+    MemBytes = 2;
+    return true;
+  case X86::MOV32mr:
+  case X86::MOVSSmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSSZmr:
+  case X86::KMOVDmk:
+    MemBytes = 4;
+    return true;
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSDmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVSDZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVQmk:
+    MemBytes = 8;
+    return true;
+  case X86::MOVAPSmr:
+  case X86::MOVUPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVUPDmr:
+  case X86::MOVDQAmr:
+  case X86::MOVDQUmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVUPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVUPDmr:
+  case X86::VMOVDQAmr:
+  case X86::VMOVDQUmr:
+  case X86::VMOVUPSYmr:
+  case X86::VMOVAPSYmr:
+  case X86::VMOVUPDYmr:
+  case X86::VMOVAPDYmr:
+  case X86::VMOVDQUYmr:
+  case X86::VMOVDQAYmr:
+  case X86::VMOVUPSZmr:
+  case X86::VMOVUPSZ128mr:
+  case X86::VMOVUPSZ256mr:
+  case X86::VMOVUPSZ128mr_NOVLX:
+  case X86::VMOVUPSZ256mr_NOVLX:
+  case X86::VMOVAPSZmr:
+  case X86::VMOVAPSZ128mr:
+  case X86::VMOVAPSZ256mr:
+  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVAPSZ256mr_NOVLX:
+  case X86::VMOVUPDZmr:
+  case X86::VMOVUPDZ128mr:
+  case X86::VMOVUPDZ256mr:
+  case X86::VMOVAPDZmr:
+  case X86::VMOVAPDZ128mr:
+  case X86::VMOVAPDZ256mr:
+  case X86::VMOVDQA32Zmr:
+  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQA32Z256mr:
+  case X86::VMOVDQU32Zmr:
+  case X86::VMOVDQU32Z128mr:
+  case X86::VMOVDQU32Z256mr:
+  case X86::VMOVDQA64Zmr:
+  case X86::VMOVDQA64Z128mr:
+  case X86::VMOVDQA64Z256mr:
+  case X86::VMOVDQU64Zmr:
+  case X86::VMOVDQU64Z128mr:
+  case X86::VMOVDQU64Z256mr:
+  case X86::VMOVDQU8Zmr:
+  case X86::VMOVDQU8Z128mr:
+  case X86::VMOVDQU8Z256mr:
+  case X86::VMOVDQU16Zmr:
+  case X86::VMOVDQU16Z128mr:
+  case X86::VMOVDQU16Z256mr:
+    MemBytes = RegSize;
     return true;
   }
   return false;
 }
@@ -4102,6 +4281,22 @@
   return 0;
 }
 
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex,
+                                           unsigned &MemBytes) const {
+  if (isFrameLoadOpcode(MI.getOpcode())) {
+    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) {
+      unsigned Reg = MI.getOperand(0).getReg();
+      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+      unsigned RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+      isFrameLoadOpcode(MI.getOpcode(), MemBytes, RegSize);
+      return Reg;
+    }
+  }
+  return 0;
+}
+
 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                  int &FrameIndex) const {
   if (isFrameLoadOpcode(MI.getOpcode())) {
@@ -4124,6
 +4319,23 @@
   return 0;
 }
 
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex,
+                                          unsigned &MemBytes) const {
+  if (isFrameStoreOpcode(MI.getOpcode())) {
+    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex)) {
+      unsigned Reg = MI.getOperand(X86::AddrNumOperands).getReg();
+      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+      unsigned RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+      isFrameStoreOpcode(MI.getOpcode(), MemBytes, RegSize);
+      return Reg;
+    }
+  }
+  return 0;
+}
+
 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
   if (isFrameStoreOpcode(MI.getOpcode())) {
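In the X86 overrides above, MemBytes for full-width vector and mask spills and reloads is derived from the register's width: a VMOVUPSZrm reload of a 512-bit ZMM register reports getRegSizeInBits(Reg, MRI) / 8 = 64 bytes, while a MOVSDrm reload always reports 8 bytes even though its XMM destination is 16 bytes wide. StackSlotColoring can then refuse to treat mismatched pairs as dead. An illustrative restatement of the check it now performs; this mirrors the diff above rather than replacing it, and TII, LoadMI, and StoreMI stand for the usual TargetInstrInfo pointer and the reload/spill candidates:

  // Sketch: a reload followed immediately by a spill is a dead round trip
  // only if the same register moves through the same slot and both sides
  // touch the same number of bytes.
  static bool isDeadRoundTrip(const TargetInstrInfo *TII,
                              const MachineInstr &LoadMI,
                              const MachineInstr &StoreMI) {
    int FirstSS = -1, SecondSS = -1;
    unsigned LoadSize = 0, StoreSize = 0;
    unsigned LoadReg = TII->isLoadFromStackSlot(LoadMI, FirstSS, LoadSize);
    unsigned StoreReg = TII->isStoreToStackSlot(StoreMI, SecondSS, StoreSize);
    return LoadReg && LoadReg == StoreReg && FirstSS == SecondSS &&
           FirstSS != -1 && LoadSize == StoreSize;
  }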
Index: test/CodeGen/Generic/pr30821.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Generic/pr30821.ll
@@ -0,0 +1,94 @@
+; RUN: lli -O1 < %s | FileCheck %s
+
+; CHECK: 23.000000 24.000000 -> 23.000000 0.000000
+
+source_filename = "pr30821.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [12 x i8] c"%lf %lf -> \00", align 1
+@.str.1 = private unnamed_addr constant [9 x i8] c"%lf %lf\0A\00", align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+entry:
+  %alpha = alloca i8, align 1
+  %foxtrot = alloca <2 x double>, align 16
+  %india = alloca <2 x double>, align 16
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %alpha)
+  store volatile i8 1, i8* %alpha, align 1
+  %alpha.0.alpha.0. = load volatile i8, i8* %alpha, align 1
+  %0 = bitcast <2 x double>* %foxtrot to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #4
+  %cmp23 = icmp eq i8 %alpha.0.alpha.0., 0
+  br i1 %cmp23, label %for.cond.cleanup, label %for.cond2.preheader
+
+for.cond2.preheader:                              ; preds = %entry, %for.cond.cleanup5
+  %charlie.024 = phi i8 [ %inc8, %for.cond.cleanup5 ], [ 0, %entry ]
+  br label %for.body6
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup5, %entry
+  %india.0.india.0..sroa_cast = bitcast <2 x double>* %india to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %india.0.india.0..sroa_cast)
+  store volatile <2 x double> <double 2.300000e+01, double 2.400000e+01>, <2 x double>* %india, align 16
+  %india.0.india.0. = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext = extractelement <2 x double> %india.0.india.0., i32 0
+  %india.0.india.0.14 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext10 = extractelement <2 x double> %india.0.india.0.14, i32 1
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i64 0, i64 0), double %vecext, double %vecext10)
+  %india.0.india.0.15 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecins = insertelement <2 x double> %india.0.india.0.15, double 0.000000e+00, i32 1
+  store volatile <2 x double> %vecins, <2 x double>* %india, align 16
+  %india.0.india.0.16 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext11 = extractelement <2 x double> %india.0.india.0.16, i32 0
+  %india.0.india.0.17 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext12 = extractelement <2 x double> %india.0.india.0.17, i32 1
+  %call13 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i64 0, i64 0), double %vecext11, double %vecext12)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %india.0.india.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #4
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %alpha)
+  ret i32 0
+
+for.cond.cleanup5:                                ; preds = %for.body6
+  %inc8 = add nuw i8 %charlie.024, 1
+  %exitcond25 = icmp eq i8 %inc8, %alpha.0.alpha.0.
+  br i1 %exitcond25, label %for.cond.cleanup, label %for.cond2.preheader
+
+for.body6:                                        ; preds = %for.body6, %for.cond2.preheader
+  %golf.022 = phi i8 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ]
+  tail call void asm sideeffect "nop", "~{ebx},~{r8},~{r9},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #4
+  call fastcc void @_ZL4inithPvj(i8* nonnull %0)
+  %inc = add nuw nsw i8 %golf.022, 1
+  %exitcond = icmp eq i8 %inc, 2
+  br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define internal fastcc void @_ZL4inithPvj(i8* nocapture %data) unnamed_addr #2 {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i.01 to i8
+  %conv1 = add i8 %0, -16
+  %arrayidx = getelementptr inbounds i8, i8* %data, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %inc = add nuw nsw i32 %i.01, 1
+  %cmp = icmp eq i64 %indvars.iv.next, 16
+  br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3
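The test pins down PR30821: india lives in a 16-byte stack slot, the inline asm clobbers enough registers that values get spilled and reloaded around the loops, and zeroing element 1 produces a narrower store back into the same slot. Before this patch, StackSlotColoring's RemoveDeadStores could match a reload from that slot with a following spill of a different width, treat the pair as a dead round trip, and delete it, so the zeroing of element 1 could be lost. Roughly the following C++ reproduces the pattern; this is an approximate reconstruction from the IR above, not the original pr30821.cpp:

  #include <cstdio>

  typedef double v2d __attribute__((vector_size(16)));

  static void init(void *data) {
    for (int i = 0; i < 16; i++)
      ((char *)data)[i] = (char)(i - 16);
  }

  int main() {
    volatile char alpha = 1;
    char n = alpha; // read the volatile once, as the IR does
    v2d foxtrot;
    for (char charlie = 0; charlie != n; ++charlie)
      for (char golf = 0; golf != 2; ++golf) {
        // Clobber most callee-saved GPRs to raise register pressure.
        asm volatile("nop" ::: "ebx", "r8", "r9", "r12", "r13", "r14", "r15");
        init(&foxtrot);
      }
    volatile v2d india = {23.0, 24.0};
    printf("%lf %lf -> ", india[0], india[1]);
    india[1] = 0.0; // 8-byte update of the 16-byte volatile vector
    printf("%lf %lf\n", india[0], india[1]);
    return 0;
  }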