Index: llvm/include/llvm/CodeGen/GlobalISel/Utils.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -270,9 +270,10 @@
 /// If successful, returns the G_BUILD_VECTOR representing the folded vector
 /// constant. \p MIB should have an insertion point already set to create new
 /// G_CONSTANT instructions as needed.
-Optional<MachineInstr *>
-ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, const Register Op2,
-                        const MachineRegisterInfo &MRI, MachineIRBuilder &MIB);
+Register ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+                                 const Register Op2,
+                                 const MachineRegisterInfo &MRI,
+                                 MachineIRBuilder &MIB);
 
 Optional<APInt> ConstantFoldExtOp(unsigned Opcode, const Register Op1,
                                   uint64_t Imm, const MachineRegisterInfo &MRI);
Index: llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -191,10 +191,10 @@
     assert(DstOps.size() == 1 && "Invalid dsts");
     if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
       // Try to constant fold vector constants.
-      auto VecCst = ConstantFoldVectorBinop(
+      Register VecCst = ConstantFoldVectorBinop(
           Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
       if (VecCst)
-        return MachineInstrBuilder(getMF(), *VecCst);
+        return buildCopy(DstOps[0], VecCst);
       break;
     }
     if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -592,17 +592,16 @@
   return None;
 }
 
-Optional<MachineInstr *>
-llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
-                              const Register Op2,
-                              const MachineRegisterInfo &MRI,
-                              MachineIRBuilder &MIB) {
+Register llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+                                       const Register Op2,
+                                       const MachineRegisterInfo &MRI,
+                                       MachineIRBuilder &MIB) {
   auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
   if (!SrcVec1)
-    return None;
+    return Register();
 
   auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI);
   if (!SrcVec2)
-    return None;
+    return Register();
 
   const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0));
@@ -611,14 +610,14 @@
     auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx),
                                       SrcVec2->getSourceReg(Idx), MRI);
     if (!MaybeCst)
-      return None;
+      return Register();
     auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0);
     FoldedElements.emplace_back(FoldedCstReg);
   }
   // Create the new vector constant.
   auto CstVec =
       MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements);
-  return &*CstVec;
+  return CstVec.getReg(0);
 }
 
 bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll
@@ -0,0 +1,8 @@
+; RUN: llc -O0 -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s
+
+define amdgpu_kernel void @test_long_add4() {
+entry:
+  %add = add <4 x i64> zeroinitializer, zeroinitializer
+  store <4 x i64> %add, <4 x i64> addrspace(1)* null, align 32
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -O0 -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -o - %s | FileCheck %s
+
+; The CSEMIRBuilder was mishandling the result operand when constant
+; folding a vector operation, resulting in a missing def for the
+; stored value.
+define amdgpu_kernel void @constant_fold_vector_add() {
+  ; CHECK-LABEL: name: constant_fold_vector_add
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64), [[C]](s64), [[C]](s64)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C2]](s64), [[C2]](s64), [[C2]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s64>) = COPY [[BUILD_VECTOR1]](<4 x s64>)
+  ; CHECK-NEXT:   G_STORE [[COPY]](<4 x s64>), [[C1]](p1) :: (store (<4 x s64>) into `<4 x i64> addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+entry:
+  %add = add <4 x i64> zeroinitializer, zeroinitializer
+  store <4 x i64> %add, <4 x i64> addrspace(1)* null, align 32
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
@@ -195,15 +195,16 @@
   ; CHECK-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C2]](s64)
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; CHECK-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C3]](s64)
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR4]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY9]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(<2 x s64>) = COPY [[BUILD_VECTOR4]](<2 x s64>)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[COPY9]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY10]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY11]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, <2 x i64> <i64 1, i64 2>
   ret <2 x i32 addrspace(1)*> %gep
 }
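
The change is essentially an API migration from Optional<MachineInstr *> to Register: failure is now signaled by an invalid (default-constructed) Register, and on success the register holding the folded G_BUILD_VECTOR is returned. A minimal caller-side sketch, mirroring the CSEMIRBuilder.cpp hunk above; the identifiers B, Src0, Src1, and DstReg are illustrative placeholders, not names from this patch:

// Illustrative sketch only; B is an already-positioned MachineIRBuilder and
// Src0/Src1/DstReg are assumed registers. Register() converts to 0, so an
// invalid result means the operands were not foldable build_vector constants.
Register Folded = ConstantFoldVectorBinop(Opc, Src0, Src1, MRI, B);
if (Folded)
  return B.buildCopy(DstReg, Folded); // reuse the folded G_BUILD_VECTOR
// Otherwise fall back to emitting the original vector operation.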