Skip to content

Commit 0330660

Browse files
committed Jun 3, 2017
[AMDGPU] Untangle SDWA pass from SIShrinkInstructions
Remove the dependency of the SDWA pass on SIShrinkInstructions. The goal is to move SDWA even higher in the stack to avoid a second run of MachineLICM, MachineCSE and SIFoldOperands. Also add handling to preserve the original src modifiers. Differential Revision: https://reviews.llvm.org/D33860 llvm-svn: 304665
1 parent 2fdf185 commit 0330660

24 files changed

+169
-121
lines changed
 

‎llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -734,14 +734,14 @@ void GCNPassConfig::addMachineSSAOptimization() {
734734
addPass(&SIFoldOperandsID);
735735
addPass(&DeadMachineInstructionElimID);
736736
addPass(&SILoadStoreOptimizerID);
737-
addPass(createSIShrinkInstructionsPass());
738737
if (EnableSDWAPeephole) {
739738
addPass(&SIPeepholeSDWAID);
740739
addPass(&MachineLICMID);
741740
addPass(&MachineCSEID);
742741
addPass(&SIFoldOperandsID);
743742
addPass(&DeadMachineInstructionElimID);
744743
}
744+
addPass(createSIShrinkInstructionsPass());
745745
}
746746

747747
bool GCNPassConfig::addILPOpts() {

‎llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

+66-25
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,8 @@ class SDWASrcOperand : public SDWAOperand {
129129
bool getNeg() const { return Neg; }
130130
bool getSext() const { return Sext; }
131131

132-
uint64_t getSrcMods() const;
132+
uint64_t getSrcMods(const SIInstrInfo *TII,
133+
const MachineOperand *SrcOp) const;
133134
};
134135

135136
class SDWADstOperand : public SDWAOperand {
@@ -240,13 +241,24 @@ static bool isSubregOf(const MachineOperand &SubReg,
240241
return SuperMask.all();
241242
}
242243

243-
uint64_t SDWASrcOperand::getSrcMods() const {
244+
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
245+
const MachineOperand *SrcOp) const {
244246
uint64_t Mods = 0;
247+
const auto *MI = SrcOp->getParent();
248+
if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
249+
if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
250+
Mods = Mod->getImm();
251+
}
252+
} else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
253+
if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
254+
Mods = Mod->getImm();
255+
}
256+
}
245257
if (Abs || Neg) {
246258
assert(!Sext &&
247259
"Float and integer src modifiers can't be set simulteniously");
248260
Mods |= Abs ? SISrcMods::ABS : 0;
249-
Mods |= Neg ? SISrcMods::NEG : 0;
261+
Mods ^= Neg ? SISrcMods::NEG : 0;
250262
} else if (Sext) {
251263
Mods |= SISrcMods::SEXT;
252264
}
@@ -312,7 +324,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
312324
}
313325
copyRegOperand(*Src, *getTargetOperand());
314326
SrcSel->setImm(getSrcSel());
315-
SrcMods->setImm(getSrcMods());
327+
SrcMods->setImm(getSrcMods(TII, Src));
316328
getTargetOperand()->setIsKill(false);
317329
return true;
318330
}
@@ -409,7 +421,10 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
409421
switch (Opcode) {
410422
case AMDGPU::V_LSHRREV_B32_e32:
411423
case AMDGPU::V_ASHRREV_I32_e32:
412-
case AMDGPU::V_LSHLREV_B32_e32: {
424+
case AMDGPU::V_LSHLREV_B32_e32:
425+
case AMDGPU::V_LSHRREV_B32_e64:
426+
case AMDGPU::V_ASHRREV_I32_e64:
427+
case AMDGPU::V_LSHLREV_B32_e64: {
413428
// from: v_lshrrev_b32_e32 v1, 16/24, v0
414429
// to SDWA src:v0 src_sel:WORD_1/BYTE_3
415430

@@ -432,7 +447,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
432447
TRI->isPhysicalRegister(Dst->getReg()))
433448
break;
434449

435-
if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
450+
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
451+
Opcode == AMDGPU::V_LSHLREV_B32_e64) {
436452
auto SDWADst = make_unique<SDWADstOperand>(
437453
Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
438454
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
@@ -441,7 +457,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
441457
} else {
442458
auto SDWASrc = make_unique<SDWASrcOperand>(
443459
Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
444-
Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
460+
Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
461+
Opcode != AMDGPU::V_LSHRREV_B32_e64);
445462
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
446463
SDWAOperands[&MI] = std::move(SDWASrc);
447464
++NumSDWAPatternsFound;
@@ -451,7 +468,10 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
451468

452469
case AMDGPU::V_LSHRREV_B16_e32:
453470
case AMDGPU::V_ASHRREV_I16_e32:
454-
case AMDGPU::V_LSHLREV_B16_e32: {
471+
case AMDGPU::V_LSHLREV_B16_e32:
472+
case AMDGPU::V_LSHRREV_B16_e64:
473+
case AMDGPU::V_ASHRREV_I16_e64:
474+
case AMDGPU::V_LSHLREV_B16_e64: {
455475
// from: v_lshrrev_b16_e32 v1, 8, v0
456476
// to SDWA src:v0 src_sel:BYTE_1
457477

@@ -472,7 +492,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
472492
TRI->isPhysicalRegister(Dst->getReg()))
473493
break;
474494

475-
if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
495+
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
496+
Opcode == AMDGPU::V_LSHLREV_B16_e64) {
476497
auto SDWADst =
477498
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
478499
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
@@ -481,7 +502,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
481502
} else {
482503
auto SDWASrc = make_unique<SDWASrcOperand>(
483504
Src1, Dst, BYTE_1, false, false,
484-
Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
505+
Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
506+
Opcode != AMDGPU::V_LSHRREV_B16_e64);
485507
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
486508
SDWAOperands[&MI] = std::move(SDWASrc);
487509
++NumSDWAPatternsFound;
@@ -549,28 +571,33 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
549571
++NumSDWAPatternsFound;
550572
break;
551573
}
552-
case AMDGPU::V_AND_B32_e32: {
574+
case AMDGPU::V_AND_B32_e32:
575+
case AMDGPU::V_AND_B32_e64: {
553576
// e.g.:
554577
// from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
555578
// to SDWA src:v0 src_sel:WORD_0/BYTE_0
556579

557580
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
581+
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
582+
auto ValSrc = Src1;
558583
auto Imm = foldToImm(*Src0);
559-
if (!Imm)
560-
break;
561584

562-
if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
585+
if (!Imm) {
586+
Imm = foldToImm(*Src1);
587+
ValSrc = Src0;
588+
}
589+
590+
if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
563591
break;
564592

565-
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
566593
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
567594

568595
if (TRI->isPhysicalRegister(Src1->getReg()) ||
569596
TRI->isPhysicalRegister(Dst->getReg()))
570597
break;
571598

572599
auto SDWASrc = make_unique<SDWASrcOperand>(
573-
Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
600+
ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
574601
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
575602
SDWAOperands[&MI] = std::move(SDWASrc);
576603
++NumSDWAPatternsFound;
@@ -583,28 +610,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
583610

584611
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
585612
// Check if this instruction has opcode that supports SDWA
586-
return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
613+
unsigned Opc = MI.getOpcode();
614+
if (AMDGPU::getSDWAOp(Opc) != -1)
615+
return true;
616+
int Opc32 = AMDGPU::getVOPe32(Opc);
617+
if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
618+
return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
619+
!TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
620+
return false;
587621
}
588622

589623
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
590624
const SDWAOperandsVector &SDWAOperands) {
591625
// Convert to sdwa
592626
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
627+
if (SDWAOpcode == -1)
628+
SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
593629
assert(SDWAOpcode != -1);
594630

631+
// Copy dst, if it is present in original then should also be present in SDWA
632+
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
633+
if (!Dst && !TII->isVOPC(MI))
634+
return false;
635+
595636
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
596637

597638
// Create SDWA version of instruction MI and initialize its operands
598639
MachineInstrBuilder SDWAInst =
599640
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
600641

601-
// Copy dst, if it is present in original then should also be present in SDWA
602-
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
603642
if (Dst) {
604643
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
605644
SDWAInst.add(*Dst);
606-
} else {
607-
assert(TII->isVOPC(MI));
608645
}
609646

610647
// Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
@@ -614,7 +651,10 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
614651
Src0 &&
615652
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
616653
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
617-
SDWAInst.addImm(0);
654+
if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
655+
SDWAInst.addImm(Mod->getImm());
656+
else
657+
SDWAInst.addImm(0);
618658
SDWAInst.add(*Src0);
619659

620660
// Copy src1 if present, initialize src1_modifiers.
@@ -623,10 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
623663
assert(
624664
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
625665
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
626-
SDWAInst.addImm(0);
666+
if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
667+
SDWAInst.addImm(Mod->getImm());
668+
else
669+
SDWAInst.addImm(0);
627670
SDWAInst.add(*Src1);
628-
} else {
629-
assert(TII->isVOP1(MI));
630671
}
631672

632673
if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||

‎llvm/test/CodeGen/AMDGPU/add.v2i16.ll

+4-4
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out
6666

6767
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
6868
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
69-
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
69+
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
7070
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
7171
%tid = call i32 @llvm.amdgcn.workitem.id.x()
7272
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -84,7 +84,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
8484

8585
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
8686
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
87-
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
87+
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8888
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8989
%tid = call i32 @llvm.amdgcn.workitem.id.x()
9090
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -101,7 +101,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
101101
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
102102
; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
103103
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
104-
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
104+
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
105105
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
106106
; VI: v_or_b32_e32
107107
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
@@ -140,7 +140,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
140140

141141
; VI-NOT: v_add_u16
142142
; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
143-
; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
143+
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
144144
; VI-NOT: v_add_u16
145145
; VI: v_or_b32_e32
146146
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {

‎llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll

+1-1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
1010

1111
; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12-
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
12+
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1313

1414
; CI: v_ashrrev_i32_e32
1515
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}

‎llvm/test/CodeGen/AMDGPU/fabs.f16.ll

+4-4
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
4040
; VI: flat_load_ushort [[LO:v[0-9]+]]
4141
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
4242
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
43-
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
43+
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4444
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
4545
; VI: flat_store_dword
4646

@@ -60,8 +60,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
6060
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
6161

6262
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
63-
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
64-
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
63+
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
64+
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6565
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6666
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6767
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -128,7 +128,7 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
128128
; CI: v_cvt_f16_f32
129129

130130
; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
131-
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
131+
; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
132132
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
133133

134134
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]

‎llvm/test/CodeGen/AMDGPU/fadd.f16.ll

+3-3
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ entry:
7878
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7979

8080
; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
81-
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81+
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8282
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
8383

8484
; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -108,7 +108,7 @@ entry:
108108
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
109109

110110
; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
111-
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
111+
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
112112
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
113113
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
114114

@@ -137,7 +137,7 @@ entry:
137137
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
138138

139139
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
140-
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
140+
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
141141
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
142142
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
143143

‎llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

+3-3
Original file line number | Diff line number | Diff line change
@@ -278,9 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
278278
}
279279

280280
; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
281-
; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
282-
; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
283-
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
281+
; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
282+
; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
283+
; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
284284
; VI-NOT: v_and_b32
285285

286286
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}

‎llvm/test/CodeGen/AMDGPU/fmul.f16.ll

+3-3
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ entry:
7878
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7979

8080
; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
81-
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81+
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8282
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
8383

8484
; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -105,7 +105,7 @@ entry:
105105
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
106106
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
107107
; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
108-
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
108+
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
109109
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
110110
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
111111
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
@@ -131,7 +131,7 @@ entry:
131131
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
132132
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
133133
; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
134-
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
134+
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
135135
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
136136
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
137137
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]

0 commit comments

Comments
 (0)
Please sign in to comment.