Index: include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- include/llvm/CodeGen/TargetInstrInfo.h
+++ include/llvm/CodeGen/TargetInstrInfo.h
@@ -225,6 +225,17 @@
     return 0;
   }
 
+  /// Optional extension of isLoadFromStackSlot that returns the number of
+  /// bytes loaded from the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the load does.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                                       int &FrameIndex,
+                                       unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isLoadFromStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic so it isn't reliable for correctness.
   virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -252,6 +263,17 @@
     return 0;
   }
 
+  /// Optional extension of isStoreToStackSlot that returns the number of
+  /// bytes stored to the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the store does.
+  virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
+                                      int &FrameIndex,
+                                      unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isStoreToStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic, so it isn't reliable for correctness.
   virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
Index: lib/CodeGen/StackSlotColoring.cpp
===================================================================
--- lib/CodeGen/StackSlotColoring.cpp
+++ lib/CodeGen/StackSlotColoring.cpp
@@ -418,7 +418,9 @@
 
     unsigned LoadReg = 0;
     unsigned StoreReg = 0;
-    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+    unsigned LoadSize = 0;
+    unsigned StoreSize = 0;
+    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
       continue;
     // Skip the ...pseudo debugging... instructions between a load and store.
     while ((NextMI != E) && NextMI->isDebugValue()) {
@@ -426,9 +428,11 @@
       ++I;
     }
     if (NextMI == E) continue;
-    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+      continue;
+    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+        LoadSize != StoreSize)
       continue;
-    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
     ++NumDead;
     changed = true;
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -238,6 +238,9 @@
   unsigned isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex,
+                               unsigned &MemBytes) const override;
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
@@ -246,6 +249,9 @@
   unsigned isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex,
+                              unsigned &MemBytes) const override;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well. This uses a heuristic so it isn't
   /// reliable for correctness.
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3939,24 +3939,40 @@
   return false;
 }
 
-static bool isFrameLoadOpcode(int Opcode) {
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
   default:
     return false;
   case X86::MOV8rm:
+  case X86::KMOVBkm:
+    MemBytes = 1;
+    return true;
   case X86::MOV16rm:
+  case X86::KMOVWkm:
+    MemBytes = 2;
+    return true;
   case X86::MOV32rm:
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+  case X86::KMOVDkm:
+    MemBytes = 4;
+    return true;
   case X86::MOV64rm:
   case X86::LD_Fp64m:
-  case X86::MOVSSrm:
   case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::KMOVQkm:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
   case X86::MOVAPDrm:
   case X86::MOVUPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::VMOVSSrm:
-  case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPSrm:
@@ -3964,131 +3980,142 @@
   case X86::VMOVUPDrm:
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm:
-  case X86::VMOVUPSYrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+    MemBytes = 16;
+    return true;
   case X86::VMOVAPSYrm:
-  case X86::VMOVUPDYrm:
+  case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
-  case X86::VMOVDQUYrm:
+  case X86::VMOVUPDYrm:
   case X86::VMOVDQAYrm:
-  case X86::MMX_MOVD64rm:
-  case X86::MMX_MOVQ64rm:
-  case X86::VMOVSSZrm:
-  case X86::VMOVSDZrm:
-  case X86::VMOVAPSZrm:
-  case X86::VMOVAPSZ128rm:
+  case X86::VMOVDQUYrm:
   case X86::VMOVAPSZ256rm:
-  case X86::VMOVAPSZ128rm_NOVLX:
-  case X86::VMOVAPSZ256rm_NOVLX:
-  case X86::VMOVUPSZrm:
-  case X86::VMOVUPSZ128rm:
   case X86::VMOVUPSZ256rm:
-  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
   case X86::VMOVUPSZ256rm_NOVLX:
-  case X86::VMOVAPDZrm:
-  case X86::VMOVAPDZ128rm:
   case X86::VMOVAPDZ256rm:
-  case X86::VMOVUPDZrm:
-  case X86::VMOVUPDZ128rm:
   case X86::VMOVUPDZ256rm:
-  case X86::VMOVDQA32Zrm:
-  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
   case X86::VMOVDQA32Z256rm:
-  case X86::VMOVDQU32Zrm:
-  case X86::VMOVDQU32Z128rm:
   case X86::VMOVDQU32Z256rm:
-  case X86::VMOVDQA64Zrm:
-  case X86::VMOVDQA64Z128rm:
   case X86::VMOVDQA64Z256rm:
-  case X86::VMOVDQU64Zrm:
-  case X86::VMOVDQU64Z128rm:
   case X86::VMOVDQU64Z256rm:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
   case X86::VMOVDQU8Zrm:
-  case X86::VMOVDQU8Z128rm:
-  case X86::VMOVDQU8Z256rm:
   case X86::VMOVDQU16Zrm:
-  case X86::VMOVDQU16Z128rm:
-  case X86::VMOVDQU16Z256rm:
-  case X86::KMOVBkm:
-  case X86::KMOVWkm:
-  case X86::KMOVDkm:
-  case X86::KMOVQkm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+    MemBytes = 64;
     return true;
   }
 }
 
-static bool isFrameStoreOpcode(int Opcode) {
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
-  default: break;
+  default:
+    return false;
   case X86::MOV8mr:
+  case X86::KMOVBmk:
+    MemBytes = 1;
+    return true;
   case X86::MOV16mr:
+  case X86::KMOVWmk:
+    MemBytes = 2;
+    return true;
   case X86::MOV32mr:
+  case X86::MOVSSmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSSZmr:
+  case X86::KMOVDmk:
+    MemBytes = 4;
+    return true;
   case X86::MOV64mr:
   case X86::ST_FpP64m:
-  case X86::MOVSSmr:
   case X86::MOVSDmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVSDZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVQmk:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSmr:
   case X86::MOVUPSmr:
   case X86::MOVAPDmr:
   case X86::MOVUPDmr:
   case X86::MOVDQAmr:
   case X86::MOVDQUmr:
-  case X86::VMOVSSmr:
-  case X86::VMOVSDmr:
   case X86::VMOVAPSmr:
   case X86::VMOVUPSmr:
   case X86::VMOVAPDmr:
   case X86::VMOVUPDmr:
   case X86::VMOVDQAmr:
   case X86::VMOVDQUmr:
+  case X86::VMOVUPSZ128mr:
+  case X86::VMOVAPSZ128mr:
+  case X86::VMOVUPSZ128mr_NOVLX:
+  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPDZ128mr:
+  case X86::VMOVAPDZ128mr:
+  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU32Z128mr:
+  case X86::VMOVDQA64Z128mr:
+  case X86::VMOVDQU64Z128mr:
+  case X86::VMOVDQU8Z128mr:
+  case X86::VMOVDQU16Z128mr:
+    MemBytes = 16;
+    return true;
   case X86::VMOVUPSYmr:
   case X86::VMOVAPSYmr:
   case X86::VMOVUPDYmr:
   case X86::VMOVAPDYmr:
   case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
-  case X86::VMOVSSZmr:
-  case X86::VMOVSDZmr:
-  case X86::VMOVUPSZmr:
-  case X86::VMOVUPSZ128mr:
   case X86::VMOVUPSZ256mr:
-  case X86::VMOVUPSZ128mr_NOVLX:
-  case X86::VMOVUPSZ256mr_NOVLX:
-  case X86::VMOVAPSZmr:
-  case X86::VMOVAPSZ128mr:
   case X86::VMOVAPSZ256mr:
-  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPSZ256mr_NOVLX:
   case X86::VMOVAPSZ256mr_NOVLX:
-  case X86::VMOVUPDZmr:
-  case X86::VMOVUPDZ128mr:
   case X86::VMOVUPDZ256mr:
-  case X86::VMOVAPDZmr:
-  case X86::VMOVAPDZ128mr:
   case X86::VMOVAPDZ256mr:
-  case X86::VMOVDQA32Zmr:
-  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU8Z256mr:
+  case X86::VMOVDQU16Z256mr:
   case X86::VMOVDQA32Z256mr:
-  case X86::VMOVDQU32Zmr:
-  case X86::VMOVDQU32Z128mr:
   case X86::VMOVDQU32Z256mr:
-  case X86::VMOVDQA64Zmr:
-  case X86::VMOVDQA64Z128mr:
   case X86::VMOVDQA64Z256mr:
-  case X86::VMOVDQU64Zmr:
-  case X86::VMOVDQU64Z128mr:
   case X86::VMOVDQU64Z256mr:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVUPSZmr:
+  case X86::VMOVAPSZmr:
+  case X86::VMOVUPDZmr:
+  case X86::VMOVAPDZmr:
   case X86::VMOVDQU8Zmr:
-  case X86::VMOVDQU8Z128mr:
-  case X86::VMOVDQU8Z256mr:
   case X86::VMOVDQU16Zmr:
-  case X86::VMOVDQU16Z128mr:
-  case X86::VMOVDQU16Z256mr:
-  case X86::MMX_MOVD64mr:
-  case X86::MMX_MOVQ64mr:
-  case X86::MMX_MOVNTQmr:
-  case X86::KMOVBmk:
-  case X86::KMOVWmk:
-  case X86::KMOVDmk:
-  case X86::KMOVQmk:
+  case X86::VMOVDQA32Zmr:
+  case X86::VMOVDQU32Zmr:
+  case X86::VMOVDQA64Zmr:
+  case X86::VMOVDQU64Zmr:
+    MemBytes = 64;
     return true;
   }
   return false;
 }
@@ -4096,7 +4123,14 @@
 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                            int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex,
+                                           unsigned &MemBytes) const {
+  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
       return MI.getOperand(0).getReg();
   return 0;
 }
@@ -4104,7 +4138,8 @@
 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                  int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
       return Reg;
@@ -4117,7 +4152,14 @@
 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex,
+                                          unsigned &MemBytes) const {
+  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
         isFrameOperand(MI, 0, FrameIndex))
       return MI.getOperand(X86::AddrNumOperands).getReg();
   return 0;
@@ -4126,7 +4168,8 @@
 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
       return Reg;
Index: test/CodeGen/X86/pr30821.mir
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr30821.mir
@@ -0,0 +1,133 @@
+# RUN: llc -x mir < %s -run-pass=greedy,virtregrewriter,stack-slot-coloring | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  define dso_local i32 @main() local_unnamed_addr {
+  entry:
+    ; Dummy IR that just performs some allocas -- the machine IR function
+    ; below is what this test is about.
+    %alpha = alloca i8, align 1
+    %foxtrot = alloca <2 x double>, align 16
+    %india = alloca <2 x double>, align 16
+    ret i32 0
+  }
+
+...
+---
+name: main
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 16
+  adjustsStack: false
+  hasCalls: true
+  stackProtector: ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+stack:
+  - { id: 0, name: alpha, type: default, offset: 0, size: 1, alignment: 1,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 1, name: foxtrot, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 2, name: india, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+  bb.0.entry:
+    ; To trick stack-slot-colouring to run its dead-store-elimination phase,
+    ; which is at fault, we need the register allocator to run, and spill in two
+    ; places that can have their slots merged. Achieve this by volatile-loading
+    ; data into $xmm[0-14] and volatile storing them later, leaving regalloc only
+    ; $xmm15 to play with in the middle.
+    ; Then, perform two virtreg load-and-store pairs, with the faulty code
+    ; sequence in the middle (MOVSDrm then MOVAPDmr on the same slot). The virtreg
+    ; gets spilt; the corresponding stack slots merged; and faulty code sequence
+    ; eliminated if LLVM is broken.
+
+    ; Make first 15 $xmm registers live
+    $xmm0 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm1 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm2 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm3 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm4 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm5 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm6 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm7 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm8 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm9 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm10 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm11 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm12 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm13 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm14 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    ; First vreg load
+    %1:vr128 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    ; First faulty sequence; %1 spilt
+    %12:fr64 = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    %13:vr128 = COPY killed %12
+    MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %13 :: (volatile store 16 into %ir.india)
+    ; CHECK: renamable $xmm{{[0-9]+}} = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    ; CHECK-NEXT: MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed renamable $xmm{{[0-9]+}} :: (volatile store 16 into %ir.india)
+
+    ; Store %1 to avoid it being optimised out, will result in a load-from-spill
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %1 :: (volatile dereferenceable store 16 into %ir.india)
+
+    ; That code sequence a second time, to generate a second spill slot that
+    ; will get coloured and merged.
+    %2:vr128 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    %22:fr64 = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    %23:vr128 = COPY killed %22
+    MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %23 :: (volatile store 16 into %ir.india)
+
+    ; CHECK: renamable $xmm{{[0-9]+}} = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    ; CHECK-NEXT: MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed renamable $xmm{{[0-9]+}} :: (volatile store 16 into %ir.india)
+
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %2 :: (volatile dereferenceable store 16 into %ir.india)
+
+    ; Stores of first 15 $xmm registers to keep them live across the middle of
+    ; this bb.
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm0 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm1 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm2 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm3 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm4 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm5 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm6 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm7 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm8 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm9 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm10 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm11 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm12 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm13 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm14 :: (volatile dereferenceable store 16 into %ir.india)
+
+    RET 0
+
+...
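For context on how the new hooks are consumed: StackSlotColoring may only delete a reload/spill pair as a dead store when the two instructions round-trip the same register through the same slot with the same access width; the MOVSDrm/MOVAPDmr pair in the test above differs in width (8 vs. 16 bytes), so it must now survive. A minimal sketch of that condition, using only the interfaces declared in this patch (the helper name is illustrative, not part of the patch):

```cpp
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Illustrative helper (not part of the patch): true only when Load reloads
// exactly the bytes that Store writes back to the same stack slot, which is
// the condition the stack slot coloring pass now requires before treating the
// pair as a removable dead store.
static bool isFullWidthRoundTrip(const TargetInstrInfo &TII,
                                 const MachineInstr &Load,
                                 const MachineInstr &Store) {
  int LoadFI = 0, StoreFI = 0;
  unsigned LoadBytes = 0, StoreBytes = 0;
  unsigned LoadReg = TII.isLoadFromStackSlot(Load, LoadFI, LoadBytes);
  unsigned StoreReg = TII.isStoreToStackSlot(Store, StoreFI, StoreBytes);
  // Both instructions must be recognised, move the same register through the
  // same (real) frame index, and access the same number of bytes.
  return LoadReg && StoreReg && LoadReg == StoreReg && LoadFI == StoreFI &&
         LoadFI != -1 && LoadBytes == StoreBytes;
}
```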