diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -47,6 +47,7 @@
 class ConstantInt;
 class ConstantFP;
 class APFloat;
+class MachineIRBuilder;
 
 // Convenience macros for dealing with vector reduction opcodes.
 #define GISEL_VECREDUCE_CASES_ALL \
@@ -266,6 +267,14 @@
                                   const Register Op2,
                                   const MachineRegisterInfo &MRI);
 
+/// Tries to constant fold a vector binop with sources \p Op1 and \p Op2.
+/// If successful, returns the G_BUILD_VECTOR representing the folded vector
+/// constant. \p MIB should have an insertion point already set to create new
+/// G_CONSTANT instructions as needed.
+Optional<MachineInstr *>
+ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, const Register Op2,
+                        const MachineRegisterInfo &MRI, MachineIRBuilder &MIB);
+
 Optional<APInt> ConstantFoldExtOp(unsigned Opcode, const Register Op1,
                                   uint64_t Imm,
                                   const MachineRegisterInfo &MRI);
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 
 using namespace llvm;
@@ -188,6 +189,14 @@
     // Try to constant fold these.
     assert(SrcOps.size() == 2 && "Invalid sources");
     assert(DstOps.size() == 1 && "Invalid dsts");
+    if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
+      // Try to constant fold vector constants.
+      auto VecCst = ConstantFoldVectorBinop(
+          Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
+      if (VecCst)
+        return MachineInstrBuilder(getMF(), *VecCst);
+      break;
+    }
     if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
                                                 SrcOps[1].getReg(), *getMRI()))
       return buildConstant(DstOps[0], *Cst);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -583,6 +584,35 @@
   return None;
 }
 
+Optional<MachineInstr *>
+llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+                              const Register Op2,
+                              const MachineRegisterInfo &MRI,
+                              MachineIRBuilder &MIB) {
+  auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
+  if (!SrcVec1)
+    return None;
+  auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI);
+  if (!SrcVec2)
+    return None;
+
+  const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0));
+
+  SmallVector<Register, 16> FoldedElements;
+  for (unsigned Idx = 0, E = SrcVec1->getNumSources(); Idx < E; ++Idx) {
+    auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx),
+                                      SrcVec2->getSourceReg(Idx), MRI);
+    if (!MaybeCst)
+      return None;
+    auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0);
+    FoldedElements.emplace_back(FoldedCstReg);
+  }
+  // Create the new vector constant.
+  auto CstVec =
+      MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements);
+  return &*CstVec;
+}
+
 bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
                            bool SNaN) {
   const MachineInstr *DefMI = MRI.getVRegDef(Val);
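The three hunks above are the whole functional change; everything that follows is test updates. As a minimal usage sketch (not part of the patch itself; the names `B`, `LHS`, `RHS` and the 4 x s32 type are illustrative, assuming `B` is a CSEMIRBuilder with a valid insertion point and both source registers are G_BUILD_VECTORs of G_CONSTANTs):

    // Both operands are constant build vectors, so instead of emitting a
    // G_SUB the CSE builder hands back the G_BUILD_VECTOR holding the
    // element-wise folded constants.
    LLT V4S32 = LLT::fixed_vector(4, 32);
    auto Folded = B.buildSub(V4S32, LHS, RHS);
    // If any element fails to fold (for example a non-constant source), the
    // builder falls out of the folding path and emits the binop as before.
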
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -18,21 +18,14 @@
 ;
 ; GISEL-LABEL: combine_vec_udiv_uniform:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI0_2
-; GISEL-NEXT: adrp x9, .LCPI0_0
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_2]
-; GISEL-NEXT: adrp x8, .LCPI0_1
-; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI0_0]
+; GISEL-NEXT: adrp x8, .LCPI0_0
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
 ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI0_1]
 ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
 ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v2.8h, v4.8h, v3.8h
 ; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: neg v2.8h, v2.8h
-; GISEL-NEXT: ushl v0.8h, v0.8h, v2.8h
-; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: ushr v0.8h, v0.8h, #4
+; GISEL-NEXT: usra v1.8h, v0.8h, #1
+; GISEL-NEXT: ushr v0.8h, v1.8h, #4
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x, 
 ret <8 x i16> %1
@@ -157,31 +150,24 @@
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform3:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI3_4
-; GISEL-NEXT: adrp x9, .LCPI3_2
-; GISEL-NEXT: adrp x10, .LCPI3_1
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_4]
-; GISEL-NEXT: adrp x8, .LCPI3_5
-; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI3_2]
-; GISEL-NEXT: adrp x9, .LCPI3_3
-; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI3_1]
+; GISEL-NEXT: adrp x8, .LCPI3_2
 ; GISEL-NEXT: adrp x10, .LCPI3_0
-; GISEL-NEXT: umull2 v4.4s, v0.8h, v1.8h
+; GISEL-NEXT: adrp x9, .LCPI3_1
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_2]
+; GISEL-NEXT: adrp x8, .LCPI3_3
+; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI3_0]
+; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI3_1]
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
 ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI3_3]
-; GISEL-NEXT: sub v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI3_0]
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v4.8h
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_5]
-; GISEL-NEXT: neg v2.8h, v2.8h
-; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h
-; GISEL-NEXT: ushl v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: cmeq v3.8h, v4.8h, v5.8h
-; GISEL-NEXT: neg v4.8h, v6.8h
-; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
-; GISEL-NEXT: shl v2.8h, v3.8h, #15
-; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
+; GISEL-NEXT: cmeq v2.8h, v2.8h, v3.8h
+; GISEL-NEXT: sub v5.8h, v0.8h, v1.8h
+; GISEL-NEXT: neg v3.8h, v4.8h
+; GISEL-NEXT: shl v2.8h, v2.8h, #15
+; GISEL-NEXT: usra v1.8h, v5.8h, #1
 ; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: ushl v1.8h, v1.8h, v3.8h
 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x, 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -39,12 +39,9 @@
 ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
 ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
-; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
 ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
-; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 16
-; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16)
-; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<8 x s16>) = G_SUB [[BUILD_VECTOR3]], [[BUILD_VECTOR2]]
-; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[SUB1]](<8 x s16>)
+; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
 ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
 ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
@@ -226,17 +223,13 @@
 ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C16]](s16), [[C16]](s16), [[C19]](s16)
 ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
-; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
 ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16)
-; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 16
-; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16)
-; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<8 x s16>) = G_SUB [[BUILD_VECTOR4]], [[BUILD_VECTOR3]]
-; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[SUB1]](<8 x s16>)
+; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR3]](<8 x s16>)
 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
 ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
-; CHECK-NEXT: [[C22:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
-; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16)
-; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR5]]
+; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16)
+; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]]
 ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
 ; CHECK-NEXT: RET_ReallyLR implicit $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir
@@ -37,15 +37,9 @@
 ; CHECK: liveins: $q0
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
-; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
-; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
-; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR1]], [[BUILD_VECTOR]]
-; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C2]](s32), [[C2]](s32), [[C2]](s32), [[C2]](s32)
-; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR2]], [[SUB]]
-; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[SUB1]](<4 x s32>)
+; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
 ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $q0
 %1:_(s32) = G_CONSTANT i32 8
@@ -113,18 +107,12 @@
 ; CHECK: liveins: $q0
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
-; CHECK-NEXT: %cst3:_(s32) = G_CONSTANT i32 32
 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 27
 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 26
-; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
-; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32)
-; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
-; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C4]](s32), [[C4]](s32), [[C4]](s32), [[C4]](s32)
-; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR1]], [[BUILD_VECTOR]]
-; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %cst3(s32), %cst3(s32), %cst3(s32), %cst3(s32)
-; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR2]], [[SUB]]
-; CHECK-NEXT: %mulh:_(<4 x s32>) = G_LSHR [[COPY]], [[SUB1]](<4 x s32>)
+; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C]](s32), [[C1]](s32), [[C2]](s32)
+; CHECK-NEXT: %mulh:_(<4 x s32>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
 ; CHECK-NEXT: $q0 = COPY %mulh(<4 x s32>)
 %0:_(<4 x s32>) = COPY $q0
 %cst1:_(s32) = G_CONSTANT i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
@@ -204,12 +204,9 @@
 ; GCN: liveins: $vgpr0_vgpr1
 ; GCN-NEXT: {{ $}}
 ; GCN-NEXT: %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
-; GCN-NEXT: %pow2:_(s32) = G_CONSTANT i32 4096
-; GCN-NEXT: %pow2_vec:_(<2 x s32>) = G_BUILD_VECTOR %pow2(s32), %pow2(s32)
-; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4095
 ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
-; GCN-NEXT: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD %pow2_vec, [[BUILD_VECTOR]]
-; GCN-NEXT: %rem:_(<2 x s32>) = G_AND %var, [[ADD]]
+; GCN-NEXT: %rem:_(<2 x s32>) = G_AND %var, [[BUILD_VECTOR]]
 ; GCN-NEXT: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
 %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
 %pow2:_(s32) = G_CONSTANT i32 4096
@@ -229,13 +226,10 @@
 ; GCN: liveins: $vgpr0_vgpr1
 ; GCN-NEXT: {{ $}}
 ; GCN-NEXT: %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
-; GCN-NEXT: %pow2_1:_(s32) = G_CONSTANT i32 4096
-; GCN-NEXT: %pow2_2:_(s32) = G_CONSTANT i32 2048
-; GCN-NEXT: %pow2_vec:_(<2 x s32>) = G_BUILD_VECTOR %pow2_1(s32), %pow2_2(s32)
-; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
-; GCN-NEXT: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD %pow2_vec, [[BUILD_VECTOR]]
-; GCN-NEXT: %rem:_(<2 x s32>) = G_AND %var, [[ADD]]
+; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4095
+; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047
+; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
+; GCN-NEXT: %rem:_(<2 x s32>) = G_AND %var, [[BUILD_VECTOR]]
 ; GCN-NEXT: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
 %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
 %pow2_1:_(s32) = G_CONSTANT i32 4096
@@ -256,12 +250,9 @@
 ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GCN-NEXT: {{ $}}
 ; GCN-NEXT: %var:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-; GCN-NEXT: %pow2:_(s64) = G_CONSTANT i64 4096
-; GCN-NEXT: %pow2_vec:_(<2 x s64>) = G_BUILD_VECTOR %pow2(s64), %pow2(s64)
-; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4095
 ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
-; GCN-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD %pow2_vec, [[BUILD_VECTOR]]
-; GCN-NEXT: %rem:_(<2 x s64>) = G_AND %var, [[ADD]]
+; GCN-NEXT: %rem:_(<2 x s64>) = G_AND %var, [[BUILD_VECTOR]]
 ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %rem(<2 x s64>)
 %var:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 %pow2:_(s64) = G_CONSTANT i64 4096
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3439,45 +3439,10 @@
 ; GFX9-LABEL: v_fshl_v2i16_4_8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe
-; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, s4, v3
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, -16, v2
-; GFX9-NEXT: v_mul_lo_u32 v5, -16, v3
-; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mul_hi_u32 v2, 4, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT: v_mul_hi_u32 v3, 8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
-; GFX9-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x80004
 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x8000c
+; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3485,44 +3450,8 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 16
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 16
-; GFX10-NEXT: s_mov_b32 s4, 0x4f7ffffe
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX10-NEXT: v_mul_f32_e32 v2, s4, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, s4, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT: v_mul_lo_u32 v4, -16, v2
-; GFX10-NEXT: v_mul_lo_u32 v5, -16, v3
-; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX10-NEXT: v_mul_hi_u32 v5, v3, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX10-NEXT: v_mul_hi_u32 v2, 8, v2
-; GFX10-NEXT: v_mul_hi_u32 v3, 4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 8, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 4, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
-; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, 0x80004, v0
+; GFX10-NEXT: v_pk_lshrrev_b16 v1, 0x8000c, v1
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> )
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3288,16 +3288,12 @@
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: s_bfe_u32 s5, 3, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 4, v2
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
 ; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3320,44 +3316,9 @@
 ; GFX9-LABEL: v_fshr_v2i16_4_8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe
-; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, s4, v3
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, -16, v2
-; GFX9-NEXT: v_mul_lo_u32 v5, -16, v3
-; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mul_hi_u32 v2, 4, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT: v_mul_hi_u32 v3, 8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
-; GFX9-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x8000c
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x80004
 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3366,44 +3327,8 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 16
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 16
-; GFX10-NEXT: s_mov_b32 s4, 0x4f7ffffe
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX10-NEXT: v_mul_f32_e32 v2, s4, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, s4, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT: v_mul_lo_u32 v4, -16, v2
-; GFX10-NEXT: v_mul_lo_u32 v5, -16, v3
-; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX10-NEXT: v_mul_hi_u32 v5, v3, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX10-NEXT: v_mul_hi_u32 v2, 8, v2
-; GFX10-NEXT: v_mul_hi_u32 v3, 4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 8, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 4, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
-; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, 0x8000c, v0
+; GFX10-NEXT: v_pk_lshrrev_b16 v1, 0x80004, v1
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> )
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
@@ -193,8 +193,9 @@
 ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64)
 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
 ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C2]](s64)
-; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[BUILD_VECTOR2]], [[BUILD_VECTOR3]]
-; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s64>)
+; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C3]](s64)
+; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR4]](<2 x s64>)
 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY9]](<2 x p1>)
 ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -976,21 +976,12 @@
 }
 
 define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) {
-; GISEL-LABEL: v_udiv_v2i64_pow2k_denom:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_sub_u32 s4, 63, 11
-; GISEL-NEXT: s_sub_u32 s4, 64, s4
-; GISEL-NEXT: v_lshr_b64 v[0:1], v[0:1], s4
-; GISEL-NEXT: v_lshr_b64 v[2:3], v[2:3], s4
-; GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_udiv_v2i64_pow2k_denom:
-; CGP: ; %bb.0:
-; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
-; CGP-NEXT: v_lshr_b64 v[2:3], v[2:3], 12
-; CGP-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_udiv_v2i64_pow2k_denom:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
+; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
 %result = udiv <2 x i64> %num, 
 ret <2 x i64> %result
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -963,35 +963,15 @@
 }
 
 define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
-; GISEL-LABEL: v_urem_v2i64_pow2k_denom:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GISEL-NEXT: s_add_u32 s5, s4, -1
-; GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GISEL-NEXT: s_and_b32 s6, s6, 1
-; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: s_addc_u32 s6, 0, -1
-; GISEL-NEXT: s_add_u32 s4, s4, -1
-; GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GISEL-NEXT: v_and_b32_e32 v0, s5, v0
-; GISEL-NEXT: s_and_b32 s5, s7, 1
-; GISEL-NEXT: v_and_b32_e32 v1, s6, v1
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
-; GISEL-NEXT: s_addc_u32 s5, 0, -1
-; GISEL-NEXT: v_and_b32_e32 v2, s4, v2
-; GISEL-NEXT: v_and_b32_e32 v3, s5, v3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i64_pow2k_denom:
-; CGP: ; %bb.0:
-; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s4, 0xfff
-; CGP-NEXT: v_and_b32_e32 v0, s4, v0
-; CGP-NEXT: v_and_b32_e32 v2, s4, v2
-; CGP-NEXT: v_mov_b32_e32 v1, 0
-; CGP-NEXT: v_mov_b32_e32 v3, 0
-; CGP-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i64_pow2k_denom:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_movk_i32 s4, 0xfff
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
 %result = urem <2 x i64> %num, 
 ret <2 x i64> %result
 }