Index: include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- include/llvm/CodeGen/TargetInstrInfo.h
+++ include/llvm/CodeGen/TargetInstrInfo.h
@@ -225,6 +225,17 @@
     return 0;
   }
 
+  /// Optional extension of isLoadFromStackSlot that returns the number of
+  /// bytes loaded from the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the load does.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                                       int &FrameIndex,
+                                       unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isLoadFromStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic so it isn't reliable for correctness.
   virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -252,6 +263,17 @@
     return 0;
   }
 
+  /// Optional extension of isStoreToStackSlot that returns the number of
+  /// bytes stored to the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the store does.
+  virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
+                                      int &FrameIndex,
+                                      unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isStoreToStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic, so it isn't reliable for correctness.
   virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
Index: lib/CodeGen/StackSlotColoring.cpp
===================================================================
--- lib/CodeGen/StackSlotColoring.cpp
+++ lib/CodeGen/StackSlotColoring.cpp
@@ -418,7 +418,9 @@
 
     unsigned LoadReg = 0;
     unsigned StoreReg = 0;
-    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+    unsigned LoadSize = 0;
+    unsigned StoreSize = 0;
+    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
       continue;
     // Skip the ...pseudo debugging... instructions between a load and store.
     while ((NextMI != E) && NextMI->isDebugValue()) {
@@ -426,9 +428,11 @@
       ++I;
     }
     if (NextMI == E) continue;
-    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+      continue;
+    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+        LoadSize != StoreSize)
       continue;
-    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
 
     ++NumDead;
     changed = true;
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -232,6 +232,9 @@
   unsigned isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex,
+                               unsigned &MemBytes) const override;
 
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
@@ -240,6 +243,9 @@
   unsigned isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex,
+                              unsigned &MemBytes) const override;
 
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
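Note (reviewer aid, not part of the patch): the default implementations above report MemBytes = 0 and forward to the existing two-argument hooks, so targets that never emit partial spills are unaffected. A backend that does emit them opts in by overriding the three-argument overloads. A minimal sketch for a hypothetical target follows; MyTarget and LDWri are illustrative names, not real opcodes:

  // Sketch only: recognize a hypothetical 4-byte reload from a stack slot.
  unsigned MyTargetInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                                  int &FrameIndex,
                                                  unsigned &MemBytes) const {
    // Operand 0 is the destination register, operand 1 the frame index.
    if (MI.getOpcode() == MyTarget::LDWri && MI.getOperand(1).isFI()) {
      FrameIndex = MI.getOperand(1).getIndex();
      MemBytes = 4;
      return MI.getOperand(0).getReg();
    }
    MemBytes = 0;
    return 0;
  }

With both hooks overridden, the LoadSize == StoreSize test added to StackSlotColoring::RemoveDeadStores above refuses to delete a writeback whose width differs from that of the preceding reload.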
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3939,24 +3939,40 @@
   return false;
 }
 
-static bool isFrameLoadOpcode(int Opcode) {
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
   default:
     return false;
   case X86::MOV8rm:
+  case X86::KMOVBkm:
+    MemBytes = 1;
+    return true;
   case X86::MOV16rm:
+  case X86::KMOVWkm:
+    MemBytes = 2;
+    return true;
   case X86::MOV32rm:
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+  case X86::KMOVDkm:
+    MemBytes = 4;
+    return true;
   case X86::MOV64rm:
   case X86::LD_Fp64m:
-  case X86::MOVSSrm:
   case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::KMOVQkm:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
   case X86::MOVAPDrm:
   case X86::MOVUPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::VMOVSSrm:
-  case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPSrm:
@@ -3964,131 +3980,142 @@
   case X86::VMOVUPDrm:
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm:
-  case X86::VMOVUPSYrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+    MemBytes = 16;
+    return true;
   case X86::VMOVAPSYrm:
-  case X86::VMOVUPDYrm:
+  case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
-  case X86::VMOVDQUYrm:
+  case X86::VMOVUPDYrm:
   case X86::VMOVDQAYrm:
-  case X86::MMX_MOVD64rm:
-  case X86::MMX_MOVQ64rm:
-  case X86::VMOVSSZrm:
-  case X86::VMOVSDZrm:
-  case X86::VMOVAPSZrm:
-  case X86::VMOVAPSZ128rm:
+  case X86::VMOVDQUYrm:
   case X86::VMOVAPSZ256rm:
-  case X86::VMOVAPSZ128rm_NOVLX:
-  case X86::VMOVAPSZ256rm_NOVLX:
-  case X86::VMOVUPSZrm:
-  case X86::VMOVUPSZ128rm:
   case X86::VMOVUPSZ256rm:
-  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
   case X86::VMOVUPSZ256rm_NOVLX:
-  case X86::VMOVAPDZrm:
-  case X86::VMOVAPDZ128rm:
   case X86::VMOVAPDZ256rm:
-  case X86::VMOVUPDZrm:
-  case X86::VMOVUPDZ128rm:
   case X86::VMOVUPDZ256rm:
-  case X86::VMOVDQA32Zrm:
-  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
   case X86::VMOVDQA32Z256rm:
-  case X86::VMOVDQU32Zrm:
-  case X86::VMOVDQU32Z128rm:
   case X86::VMOVDQU32Z256rm:
-  case X86::VMOVDQA64Zrm:
-  case X86::VMOVDQA64Z128rm:
   case X86::VMOVDQA64Z256rm:
-  case X86::VMOVDQU64Zrm:
-  case X86::VMOVDQU64Z128rm:
   case X86::VMOVDQU64Z256rm:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
   case X86::VMOVDQU8Zrm:
-  case X86::VMOVDQU8Z128rm:
-  case X86::VMOVDQU8Z256rm:
   case X86::VMOVDQU16Zrm:
-  case X86::VMOVDQU16Z128rm:
-  case X86::VMOVDQU16Z256rm:
-  case X86::KMOVBkm:
-  case X86::KMOVWkm:
-  case X86::KMOVDkm:
-  case X86::KMOVQkm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+    MemBytes = 64;
     return true;
   }
 }
 
-static bool isFrameStoreOpcode(int Opcode) {
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
-  default: break;
+  default:
+    return false;
   case X86::MOV8mr:
+  case X86::KMOVBmk:
+    MemBytes = 1;
+    return true;
   case X86::MOV16mr:
+  case X86::KMOVWmk:
+    MemBytes = 2;
+    return true;
   case X86::MOV32mr:
+  case X86::MOVSSmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSSZmr:
+  case X86::KMOVDmk:
+    MemBytes = 4;
+    return true;
   case X86::MOV64mr:
   case X86::ST_FpP64m:
-  case X86::MOVSSmr:
   case X86::MOVSDmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVSDZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVQmk:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSmr:
   case X86::MOVUPSmr:
   case X86::MOVAPDmr:
   case X86::MOVUPDmr:
   case X86::MOVDQAmr:
   case X86::MOVDQUmr:
-  case X86::VMOVSSmr:
-  case X86::VMOVSDmr:
   case X86::VMOVAPSmr:
   case X86::VMOVUPSmr:
   case X86::VMOVAPDmr:
   case X86::VMOVUPDmr:
   case X86::VMOVDQAmr:
   case X86::VMOVDQUmr:
+  case X86::VMOVUPSZ128mr:
+  case X86::VMOVAPSZ128mr:
+  case X86::VMOVUPSZ128mr_NOVLX:
+  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPDZ128mr:
+  case X86::VMOVAPDZ128mr:
+  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU32Z128mr:
+  case X86::VMOVDQA64Z128mr:
+  case X86::VMOVDQU64Z128mr:
+  case X86::VMOVDQU8Z128mr:
+  case X86::VMOVDQU16Z128mr:
+    MemBytes = 16;
+    return true;
   case X86::VMOVUPSYmr:
   case X86::VMOVAPSYmr:
   case X86::VMOVUPDYmr:
   case X86::VMOVAPDYmr:
   case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
-  case X86::VMOVSSZmr:
-  case X86::VMOVSDZmr:
-  case X86::VMOVUPSZmr:
-  case X86::VMOVUPSZ128mr:
   case X86::VMOVUPSZ256mr:
-  case X86::VMOVUPSZ128mr_NOVLX:
-  case X86::VMOVUPSZ256mr_NOVLX:
-  case X86::VMOVAPSZmr:
-  case X86::VMOVAPSZ128mr:
   case X86::VMOVAPSZ256mr:
-  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPSZ256mr_NOVLX:
   case X86::VMOVAPSZ256mr_NOVLX:
-  case X86::VMOVUPDZmr:
-  case X86::VMOVUPDZ128mr:
   case X86::VMOVUPDZ256mr:
-  case X86::VMOVAPDZmr:
-  case X86::VMOVAPDZ128mr:
   case X86::VMOVAPDZ256mr:
-  case X86::VMOVDQA32Zmr:
-  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU8Z256mr:
+  case X86::VMOVDQU16Z256mr:
   case X86::VMOVDQA32Z256mr:
-  case X86::VMOVDQU32Zmr:
-  case X86::VMOVDQU32Z128mr:
   case X86::VMOVDQU32Z256mr:
-  case X86::VMOVDQA64Zmr:
-  case X86::VMOVDQA64Z128mr:
   case X86::VMOVDQA64Z256mr:
-  case X86::VMOVDQU64Zmr:
-  case X86::VMOVDQU64Z128mr:
   case X86::VMOVDQU64Z256mr:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVUPSZmr:
+  case X86::VMOVAPSZmr:
+  case X86::VMOVUPDZmr:
+  case X86::VMOVAPDZmr:
   case X86::VMOVDQU8Zmr:
-  case X86::VMOVDQU8Z128mr:
-  case X86::VMOVDQU8Z256mr:
   case X86::VMOVDQU16Zmr:
-  case X86::VMOVDQU16Z128mr:
-  case X86::VMOVDQU16Z256mr:
-  case X86::MMX_MOVD64mr:
-  case X86::MMX_MOVQ64mr:
-  case X86::MMX_MOVNTQmr:
-  case X86::KMOVBmk:
-  case X86::KMOVWmk:
-  case X86::KMOVDmk:
-  case X86::KMOVQmk:
+  case X86::VMOVDQA32Zmr:
+  case X86::VMOVDQU32Zmr:
+  case X86::VMOVDQA64Zmr:
+  case X86::VMOVDQU64Zmr:
+    MemBytes = 64;
     return true;
   }
   return false;
 }
@@ -4096,7 +4123,14 @@
 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                            int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex,
+                                           unsigned &MemBytes) const {
+  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
       return MI.getOperand(0).getReg();
   return 0;
 }
@@ -4104,7 +4138,8 @@
 
 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                  int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
       return Reg;
@@ -4117,7 +4152,14 @@
 
 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex,
+                                          unsigned &MemBytes) const {
+  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
         isFrameOperand(MI, 0, FrameIndex))
       return MI.getOperand(X86::AddrNumOperands).getReg();
@@ -4126,7 +4168,8 @@
 
 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
       return Reg;
Index: test/CodeGen/Generic/pr30821.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Generic/pr30821.ll
@@ -0,0 +1,94 @@
+; RUN: lli -O1 < %s | FileCheck %s
+
+; CHECK: 23.000000 24.000000 -> 23.000000 0.000000
+
+source_filename = "pr30821.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [12 x i8] c"%lf %lf -> \00", align 1
+@.str.1 = private unnamed_addr constant [9 x i8] c"%lf %lf\0A\00", align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+entry:
+  %alpha = alloca i8, align 1
+  %foxtrot = alloca <2 x double>, align 16
+  %india = alloca <2 x double>, align 16
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %alpha)
+  store volatile i8 1, i8* %alpha, align 1
+  %alpha.0.alpha.0. = load volatile i8, i8* %alpha, align 1
+  %0 = bitcast <2 x double>* %foxtrot to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #4
+  %cmp23 = icmp eq i8 %alpha.0.alpha.0., 0
+  br i1 %cmp23, label %for.cond.cleanup, label %for.cond2.preheader
+
+for.cond2.preheader:                              ; preds = %entry, %for.cond.cleanup5
+  %charlie.024 = phi i8 [ %inc8, %for.cond.cleanup5 ], [ 0, %entry ]
+  br label %for.body6
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup5, %entry
+  %india.0.india.0..sroa_cast = bitcast <2 x double>* %india to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %india.0.india.0..sroa_cast)
+  store volatile <2 x double> <double 2.300000e+01, double 2.400000e+01>, <2 x double>* %india, align 16
+  %india.0.india.0. = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext = extractelement <2 x double> %india.0.india.0., i32 0
+  %india.0.india.0.14 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext10 = extractelement <2 x double> %india.0.india.0.14, i32 1
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i64 0, i64 0), double %vecext, double %vecext10)
+  %india.0.india.0.15 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecins = insertelement <2 x double> %india.0.india.0.15, double 0.000000e+00, i32 1
+  store volatile <2 x double> %vecins, <2 x double>* %india, align 16
+  %india.0.india.0.16 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext11 = extractelement <2 x double> %india.0.india.0.16, i32 0
+  %india.0.india.0.17 = load volatile <2 x double>, <2 x double>* %india, align 16
+  %vecext12 = extractelement <2 x double> %india.0.india.0.17, i32 1
+  %call13 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i64 0, i64 0), double %vecext11, double %vecext12)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %india.0.india.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #4
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %alpha)
+  ret i32 0
+
+for.cond.cleanup5:                                ; preds = %for.body6
+  %inc8 = add nuw i8 %charlie.024, 1
+  %exitcond25 = icmp eq i8 %inc8, %alpha.0.alpha.0.
+  br i1 %exitcond25, label %for.cond.cleanup, label %for.cond2.preheader
+
+for.body6:                                        ; preds = %for.body6, %for.cond2.preheader
+  %golf.022 = phi i8 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ]
+  tail call void asm sideeffect "nop", "~{ebx},~{r8},~{r9},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #4
+  call fastcc void @_ZL4inithPvj(i8* nonnull %0)
+  %inc = add nuw nsw i8 %golf.022, 1
+  %exitcond = icmp eq i8 %inc, 2
+  br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define internal fastcc void @_ZL4inithPvj(i8* nocapture %data) unnamed_addr #2 {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i.01 to i8
+  %conv1 = add i8 %0, -16
+  %arrayidx = getelementptr inbounds i8, i8* %data, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %inc = add nuw nsw i32 %i.01, 1
+  %cmp = icmp eq i64 %indvars.iv.next, 16
+  br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3
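Note (not part of the patch): this test pins down PR30821 end to end under lli. The inline asm clobbers create enough register pressure to spill the <2 x double> values, and the volatile accesses to %india produce a reload/writeback pair on the same stack slot whose widths can differ. Previously RemoveDeadStores compared only the register and the frame index, so such a writeback could be deleted as dead and the second printf would see a stale lane instead of 0.000000; with the LoadSize == StoreSize check the store is kept. After applying the patch, the test can be run standalone with llvm-lit (paths illustrative, relative to an LLVM build directory):

  ./bin/llvm-lit -v ../llvm/test/CodeGen/Generic/pr30821.ll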