Skip to content

Commit 9c20156

Browse files
committedMay 3, 2019
[MIR] Add simple PRE pass to MachineCSE
This is the second part of the commit fixing PR38917 (hoisting partially redundant machine instructions). Most of the PRE (partial redundancy elimination) and CSE work is done on LLVM IR, but some redundancy arises during DAG legalization, and Machine CSE alone is not enough to deal with it. This simple PRE implementation works in a slightly roundabout way: it runs before CSE, looks for partial redundancy and transforms it into full redundancy, anticipating that the following CSE step will eliminate the redundancy it created. If CSE doesn't eliminate it, then the created instruction will remain dead and will be eliminated later by the Remove Dead Machine Instructions pass. The third part of the commit is supposed to refactor MachineCSE, to make it clearer and to merge MachinePRE with MachineCSE, so that one need not rely on a later Remove Dead pass to clean up instructions not eliminated by CSE. First step: https://reviews.llvm.org/D54839 Fixes llvm.org/PR38917 Reviewers: RKSimon Subscribers: hfinkel, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D56772 llvm-svn: 359870
1 parent 88f9117 commit 9c20156

9 files changed

+1515
-2084
lines changed
 

‎llvm/lib/CodeGen/MachineCSE.cpp

Lines changed: 117 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/ADT/SmallVector.h"
2020
#include "llvm/ADT/Statistic.h"
2121
#include "llvm/Analysis/AliasAnalysis.h"
22+
#include "llvm/Analysis/CFG.h"
2223
#include "llvm/CodeGen/MachineBasicBlock.h"
2324
#include "llvm/CodeGen/MachineDominators.h"
2425
#include "llvm/CodeGen/MachineFunction.h"
@@ -49,6 +50,8 @@ using namespace llvm;
4950

5051
STATISTIC(NumCoalesces, "Number of copies coalesced");
5152
STATISTIC(NumCSEs, "Number of common subexpression eliminated");
53+
STATISTIC(NumPREs, "Number of partial redundant expression"
54+
" transformed to fully redundant");
5255
STATISTIC(NumPhysCSEs,
5356
"Number of physreg referencing common subexpr eliminated");
5457
STATISTIC(NumCrossBBCSEs,
@@ -84,6 +87,7 @@ namespace {
8487

8588
void releaseMemory() override {
8689
ScopeMap.clear();
90+
PREMap.clear();
8791
Exps.clear();
8892
}
8993

@@ -98,6 +102,8 @@ namespace {
98102

99103
unsigned LookAheadLimit = 0;
100104
DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
105+
DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait>
106+
PREMap;
101107
ScopedHTType VNT;
102108
SmallVector<MachineInstr *, 64> Exps;
103109
unsigned CurrVN = 0;
@@ -116,13 +122,17 @@ namespace {
116122
PhysDefVector &PhysDefs, bool &NonLocal) const;
117123
bool isCSECandidate(MachineInstr *MI);
118124
bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
119-
MachineInstr *CSMI, MachineInstr *MI);
125+
MachineBasicBlock *CSBB, MachineInstr *MI);
120126
void EnterScope(MachineBasicBlock *MBB);
121127
void ExitScope(MachineBasicBlock *MBB);
122-
bool ProcessBlock(MachineBasicBlock *MBB);
128+
bool ProcessBlockCSE(MachineBasicBlock *MBB);
123129
void ExitScopeIfDone(MachineDomTreeNode *Node,
124130
DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
125131
bool PerformCSE(MachineDomTreeNode *Node);
132+
133+
bool isPRECandidate(MachineInstr *MI);
134+
bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
135+
bool PerformSimplePRE(MachineDominatorTree *DT);
126136
};
127137

128138
} // end anonymous namespace
@@ -405,9 +415,10 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) {
405415
}
406416

407417
/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
408-
/// common expression that defines Reg.
418+
/// common expression that defines Reg. CSBB is basic block where CSReg is
419+
/// defined.
409420
bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
410-
MachineInstr *CSMI, MachineInstr *MI) {
421+
MachineBasicBlock *CSBB, MachineInstr *MI) {
411422
// FIXME: Heuristics that works around the lack the live range splitting.
412423

413424
// If CSReg is used at all uses of Reg, CSE should not increase register
@@ -433,7 +444,6 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
433444
// an immediate predecessor. We don't want to increase register pressure and
434445
// end up causing other computation to be spilled.
435446
if (TII->isAsCheapAsAMove(*MI)) {
436-
MachineBasicBlock *CSBB = CSMI->getParent();
437447
MachineBasicBlock *BB = MI->getParent();
438448
if (CSBB != BB && !CSBB->isSuccessor(BB))
439449
return false;
@@ -488,7 +498,7 @@ void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
488498
ScopeMap.erase(SI);
489499
}
490500

491-
bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
501+
bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
492502
bool Changed = false;
493503

494504
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
@@ -598,7 +608,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
598608
TargetRegisterInfo::isVirtualRegister(NewReg) &&
599609
"Do not CSE physical register defs!");
600610

601-
if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
611+
if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
602612
LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
603613
DoCSE = false;
604614
break;
@@ -738,14 +748,109 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
738748
for (MachineDomTreeNode *Node : Scopes) {
739749
MachineBasicBlock *MBB = Node->getBlock();
740750
EnterScope(MBB);
741-
Changed |= ProcessBlock(MBB);
751+
Changed |= ProcessBlockCSE(MBB);
742752
// If it's a leaf node, it's done. Traverse upwards to pop ancestors.
743753
ExitScopeIfDone(Node, OpenChildren);
744754
}
745755

746756
return Changed;
747757
}
748758

759+
// We use stronger checks for PRE candidate rather than for CSE ones to embrace
760+
// checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps
761+
// to exclude instrs created by PRE that won't be CSEed later.
762+
bool MachineCSE::isPRECandidate(MachineInstr *MI) {
763+
if (!isCSECandidate(MI) ||
764+
MI->isNotDuplicable() ||
765+
MI->isAsCheapAsAMove() ||
766+
MI->getNumDefs() != 1 ||
767+
MI->getNumExplicitDefs() != 1)
768+
return false;
769+
770+
for (auto def : MI->defs())
771+
if (!TRI->isVirtualRegister(def.getReg()))
772+
return false;
773+
774+
for (auto use : MI->uses())
775+
if (use.isReg() && !TRI->isVirtualRegister(use.getReg()))
776+
return false;
777+
778+
return true;
779+
}
780+
781+
/// ProcessBlockPRE - Scan MBB for PRE candidates. The first time an expression
/// is seen, the block it lives in is recorded in PREMap. When an equivalent
/// expression shows up again in a block whose nearest common dominator with
/// the recorded block is not the recorded block itself, and one of the two
/// blocks is potentially reachable from the other, MI is duplicated (with a
/// fresh virtual register as its def) in front of the first terminator of that
/// common dominator. Returns true if any instruction was created.
bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
                                 MachineBasicBlock *MBB) {
  bool MadeChange = false;

  MachineBasicBlock::iterator Cur = MBB->begin();
  const MachineBasicBlock::iterator End = MBB->end();
  while (Cur != End) {
    MachineInstr *MI = &*Cur;
    // Step past MI before doing anything else with it.
    ++Cur;

    if (!isPRECandidate(MI))
      continue;

    // First sighting of this expression: just remember where it was found.
    auto Inserted = PREMap.insert(std::make_pair(MI, MBB));
    if (Inserted.second)
      continue;

    // MBB1 is the block recorded for an equivalent, previously seen instr.
    MachineBasicBlock *MBB1 = Inserted.first->second;
    assert(
        !DT->properlyDominates(MBB, MBB1) &&
        "MBB cannot properly dominate MBB1 while DFS through dominators tree!");
    MachineBasicBlock *CMBB = DT->findNearestCommonDominator(MBB, MBB1);

    // If the recorded block is itself the common dominator there is nothing
    // partial about the redundancy, so no duplicate is needed.
    if (CMBB == MBB1)
      continue;

    // Two instrs are treated as partially redundant only when one of their
    // blocks can be reached from the other. Reachability is queried on the
    // underlying IR blocks, so both of them have to exist.
    const auto *BB = MBB->getBasicBlock();
    const auto *BB1 = MBB1->getBasicBlock();
    if (BB == nullptr || BB1 == nullptr)
      continue;
    if (!isPotentiallyReachable(BB1, BB) && !isPotentiallyReachable(BB, BB1))
      continue;

    assert(MI->getOperand(0).isDef() &&
           "First operand of instr with one explicit def must be this def");
    unsigned VReg = MI->getOperand(0).getReg();
    unsigned NewReg = MRI->cloneVirtualRegister(VReg);
    if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
      continue;

    // Materialize the copy in the common dominator and give it its own def.
    MachineInstr &NewMI =
        TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
    NewMI.getOperand(0).setReg(NewReg);

    // From now on the expression is considered to live in CMBB.
    PREMap[MI] = CMBB;
    ++NumPREs;
    MadeChange = true;
  }

  return MadeChange;
}
828+
829+
// This simple PRE (partial redundancy elimination) pass doesn't actually
830+
// eliminate partial redundancy but transforms it to full redundancy,
831+
// anticipating that the next CSE step will eliminate this created redundancy.
832+
// If CSE doesn't eliminate this, than created instruction will remain dead
833+
// and eliminated later by Remove Dead Machine Instructions pass.
834+
bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
835+
SmallVector<MachineDomTreeNode *, 32> BBs;
836+
837+
PREMap.clear();
838+
bool Changed = false;
839+
BBs.push_back(DT->getRootNode());
840+
do {
841+
auto Node = BBs.pop_back_val();
842+
const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
843+
for (MachineDomTreeNode *Child : Children)
844+
BBs.push_back(Child);
845+
846+
MachineBasicBlock *MBB = Node->getBlock();
847+
Changed |= ProcessBlockPRE(DT, MBB);
848+
849+
} while (!BBs.empty());
850+
851+
return Changed;
852+
}
853+
749854
bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
750855
if (skipFunction(MF.getFunction()))
751856
return false;
@@ -756,5 +861,8 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
756861
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
757862
DT = &getAnalysis<MachineDominatorTree>();
758863
LookAheadLimit = TII->getMachineCSELookAheadLimit();
759-
return PerformCSE(DT->getRootNode());
864+
bool ChangedPRE, ChangedCSE;
865+
ChangedPRE = PerformSimplePRE(DT);
866+
ChangedCSE = PerformCSE(DT->getRootNode());
867+
return ChangedPRE || ChangedCSE;
760868
}

‎llvm/test/CodeGen/Mips/internalfunc.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@ if.then: ; preds = %entry
2727
if.end: ; preds = %entry, %if.then
2828
; CHECK: lw $[[R2:[0-9]+]], %got(sf2)
2929
; CHECK: addiu ${{[0-9]+}}, $[[R2]], %lo(sf2)
30-
; CHECK: lw $[[R3:[0-9]+]], %got(caller.sf1)
31-
; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R3]])
30+
; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R1]])
3231
%tobool3 = icmp ne i32 %a0, 0
3332
%tmp4 = load void (...)*, void (...)** @gf1, align 4
3433
%cond = select i1 %tobool3, void (...)* %tmp4, void (...)* bitcast (void ()* @sf2 to void (...)*)

‎llvm/test/CodeGen/X86/avx2-masked-gather.ll

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -236,18 +236,17 @@ define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i3
236236
; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
237237
; NOGATHER-NEXT: .LBB4_4: # %else2
238238
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
239+
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
239240
; NOGATHER-NEXT: testb $1, %al
240241
; NOGATHER-NEXT: je .LBB4_6
241242
; NOGATHER-NEXT: # %bb.5: # %cond.load4
242-
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
243-
; NOGATHER-NEXT: vmovq %xmm3, %rax
243+
; NOGATHER-NEXT: vmovq %xmm0, %rax
244244
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
245245
; NOGATHER-NEXT: .LBB4_6: # %else5
246246
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
247247
; NOGATHER-NEXT: testb $1, %al
248248
; NOGATHER-NEXT: je .LBB4_8
249249
; NOGATHER-NEXT: # %bb.7: # %cond.load7
250-
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
251250
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
252251
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm2
253252
; NOGATHER-NEXT: .LBB4_8: # %else8
@@ -295,18 +294,17 @@ define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <
295294
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
296295
; NOGATHER-NEXT: .LBB5_4: # %else2
297296
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
297+
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
298298
; NOGATHER-NEXT: testb $1, %al
299299
; NOGATHER-NEXT: je .LBB5_6
300300
; NOGATHER-NEXT: # %bb.5: # %cond.load4
301-
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
302-
; NOGATHER-NEXT: vmovq %xmm3, %rax
301+
; NOGATHER-NEXT: vmovq %xmm0, %rax
303302
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
304303
; NOGATHER-NEXT: .LBB5_6: # %else5
305304
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
306305
; NOGATHER-NEXT: testb $1, %al
307306
; NOGATHER-NEXT: je .LBB5_8
308307
; NOGATHER-NEXT: # %bb.7: # %cond.load7
309-
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
310308
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
311309
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
312310
; NOGATHER-NEXT: .LBB5_8: # %else8
@@ -366,19 +364,18 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
366364
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
367365
; NOGATHER-NEXT: .LBB6_4: # %else2
368366
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
367+
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
369368
; NOGATHER-NEXT: testb $1, %al
370369
; NOGATHER-NEXT: je .LBB6_6
371370
; NOGATHER-NEXT: # %bb.5: # %cond.load4
372-
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
373-
; NOGATHER-NEXT: vmovq %xmm4, %rax
371+
; NOGATHER-NEXT: vmovq %xmm3, %rax
374372
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm4
375373
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
376374
; NOGATHER-NEXT: .LBB6_6: # %else5
377375
; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
378376
; NOGATHER-NEXT: testb $1, %al
379377
; NOGATHER-NEXT: je .LBB6_8
380378
; NOGATHER-NEXT: # %bb.7: # %cond.load7
381-
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
382379
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
383380
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm3
384381
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@@ -402,11 +399,11 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
402399
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
403400
; NOGATHER-NEXT: .LBB6_12: # %else14
404401
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
402+
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
405403
; NOGATHER-NEXT: testb $1, %al
406404
; NOGATHER-NEXT: je .LBB6_14
407405
; NOGATHER-NEXT: # %bb.13: # %cond.load16
408-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
409-
; NOGATHER-NEXT: vmovq %xmm3, %rax
406+
; NOGATHER-NEXT: vmovq %xmm2, %rax
410407
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
411408
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
412409
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -415,8 +412,7 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
415412
; NOGATHER-NEXT: testb $1, %al
416413
; NOGATHER-NEXT: je .LBB6_16
417414
; NOGATHER-NEXT: # %bb.15: # %cond.load19
418-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
419-
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
415+
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
420416
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
421417
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0
422418
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -477,19 +473,18 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
477473
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
478474
; NOGATHER-NEXT: .LBB7_4: # %else2
479475
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
476+
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
480477
; NOGATHER-NEXT: testb $1, %al
481478
; NOGATHER-NEXT: je .LBB7_6
482479
; NOGATHER-NEXT: # %bb.5: # %cond.load4
483-
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
484-
; NOGATHER-NEXT: vmovq %xmm4, %rax
480+
; NOGATHER-NEXT: vmovq %xmm3, %rax
485481
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
486482
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
487483
; NOGATHER-NEXT: .LBB7_6: # %else5
488484
; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
489485
; NOGATHER-NEXT: testb $1, %al
490486
; NOGATHER-NEXT: je .LBB7_8
491487
; NOGATHER-NEXT: # %bb.7: # %cond.load7
492-
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
493488
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
494489
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0]
495490
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@@ -514,11 +509,11 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
514509
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
515510
; NOGATHER-NEXT: .LBB7_12: # %else14
516511
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
512+
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
517513
; NOGATHER-NEXT: testb $1, %al
518514
; NOGATHER-NEXT: je .LBB7_14
519515
; NOGATHER-NEXT: # %bb.13: # %cond.load16
520-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
521-
; NOGATHER-NEXT: vmovq %xmm3, %rax
516+
; NOGATHER-NEXT: vmovq %xmm2, %rax
522517
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
523518
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
524519
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -527,8 +522,7 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
527522
; NOGATHER-NEXT: testb $1, %al
528523
; NOGATHER-NEXT: je .LBB7_16
529524
; NOGATHER-NEXT: # %bb.15: # %cond.load19
530-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
531-
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
525+
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
532526
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
533527
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
534528
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -583,11 +577,11 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
583577
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
584578
; NOGATHER-NEXT: .LBB8_4: # %else2
585579
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
580+
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
586581
; NOGATHER-NEXT: testb $1, %al
587582
; NOGATHER-NEXT: je .LBB8_6
588583
; NOGATHER-NEXT: # %bb.5: # %cond.load4
589-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
590-
; NOGATHER-NEXT: vmovq %xmm3, %rax
584+
; NOGATHER-NEXT: vmovq %xmm2, %rax
591585
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
592586
; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm3, %xmm3
593587
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -596,8 +590,7 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
596590
; NOGATHER-NEXT: testb $1, %al
597591
; NOGATHER-NEXT: je .LBB8_8
598592
; NOGATHER-NEXT: # %bb.7: # %cond.load7
599-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
600-
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
593+
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
601594
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
602595
; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
603596
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -652,11 +645,11 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
652645
; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
653646
; NOGATHER-NEXT: .LBB9_4: # %else2
654647
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
648+
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
655649
; NOGATHER-NEXT: testb $1, %al
656650
; NOGATHER-NEXT: je .LBB9_6
657651
; NOGATHER-NEXT: # %bb.5: # %cond.load4
658-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
659-
; NOGATHER-NEXT: vmovq %xmm3, %rax
652+
; NOGATHER-NEXT: vmovq %xmm2, %rax
660653
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
661654
; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
662655
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -665,8 +658,7 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
665658
; NOGATHER-NEXT: testb $1, %al
666659
; NOGATHER-NEXT: je .LBB9_8
667660
; NOGATHER-NEXT: # %bb.7: # %cond.load7
668-
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
669-
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
661+
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
670662
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
671663
; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
672664
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1

‎llvm/test/CodeGen/X86/masked_compressstore.ll

Lines changed: 389 additions & 624 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/masked_gather.ll

Lines changed: 38 additions & 56 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/masked_store.ll

Lines changed: 305 additions & 417 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/masked_store_trunc.ll

Lines changed: 215 additions & 316 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Lines changed: 213 additions & 314 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/masked_store_trunc_usat.ll

Lines changed: 217 additions & 318 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.