Index: llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -209,6 +209,14 @@
     IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                          CCAssignFn *AssignFn)
         : ValueHandler(true, MIRBuilder, MRI, AssignFn) {}
+
+    /// Insert G_ASSERT_ZEXT/G_ASSERT_SEXT or other hint instruction based on \p
+    /// SrcReg, returning the new register if a hint was inserted.
+    Register buildExtensionHint(CCValAssign &VA, Register SrcReg, LLT NarrowTy);
+
+    /// Provides a default implementation for argument handling.
+    void assignValueToReg(Register ValVReg, Register PhysReg,
+                          CCValAssign &VA) override;
   };
 
   struct OutgoingValueHandler : public ValueHandler {
Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -985,3 +985,40 @@
 }
 
 void CallLowering::ValueHandler::anchor() {}
+
+Register CallLowering::IncomingValueHandler::buildExtensionHint(CCValAssign &VA,
+                                                                Register SrcReg,
+                                                                LLT NarrowTy) {
+  switch (VA.getLocInfo()) {
+  case CCValAssign::LocInfo::ZExt: {
+    return MIRBuilder
+        .buildAssertZExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
+                         NarrowTy.getScalarSizeInBits())
+        .getReg(0);
+  }
+  case CCValAssign::LocInfo::SExt: {
+    return MIRBuilder
+        .buildAssertSExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
+                         NarrowTy.getScalarSizeInBits())
+        .getReg(0);
+  }
+  default:
+    return SrcReg;
+  }
+}
+
+void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg,
+                                                          Register PhysReg,
+                                                          CCValAssign &VA) {
+  const LLT LocTy(VA.getLocVT());
+  const LLT ValTy = MRI.getType(ValVReg);
+
+  if (ValTy.getSizeInBits() == LocTy.getSizeInBits()) {
+    MIRBuilder.buildCopy(ValVReg, PhysReg);
+    return;
+  }
+
+  auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg);
+  auto Hint = buildExtensionHint(VA, Copy.getReg(0), ValTy);
+  MIRBuilder.buildTrunc(ValVReg, Hint);
+}
Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -70,34 +70,7 @@
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
     markPhysRegUsed(PhysReg);
-    switch (VA.getLocInfo()) {
-    default:
-      MIRBuilder.buildCopy(ValVReg, PhysReg);
-      break;
-    case CCValAssign::LocInfo::ZExt: {
-      auto WideTy = LLT{VA.getLocVT()};
-      auto NarrowTy = MRI.getType(ValVReg);
-      MIRBuilder.buildTrunc(ValVReg,
-                            MIRBuilder.buildAssertZExt(
-                                WideTy, MIRBuilder.buildCopy(WideTy, PhysReg),
-                                NarrowTy.getSizeInBits()));
-      break;
-    }
-    case CCValAssign::LocInfo::SExt: {
-      auto WideTy = LLT{VA.getLocVT()};
-      auto NarrowTy = MRI.getType(ValVReg);
-      MIRBuilder.buildTrunc(ValVReg,
-                            MIRBuilder.buildAssertSExt(
-                                WideTy, MIRBuilder.buildCopy(WideTy, PhysReg),
-                                NarrowTy.getSizeInBits()));
-      break;
-    }
-    case CCValAssign::LocInfo::AExt: {
-      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
-      MIRBuilder.buildTrunc(ValVReg, Copy);
-      break;
-    }
-    }
+    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
   }
 
   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -29,28 +29,22 @@
 
 namespace {
 
-struct AMDGPUValueHandler : public CallLowering::ValueHandler {
-  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
-                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
-      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}
-
-  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
-  /// register.
-  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
-    if (VA.getLocVT().getSizeInBits() < 32) {
-      // 16-bit types are reported as legal for 32-bit registers. We need to
-      // extend and do a 32-bit copy to avoid the verifier complaining about it.
-      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
-    }
-
-    return extendRegister(ValVReg, VA);
+/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
+static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
+                                    Register ValVReg, CCValAssign &VA) {
+  if (VA.getLocVT().getSizeInBits() < 32) {
+    // 16-bit types are reported as legal for 32-bit registers. We need to
+    // extend and do a 32-bit copy to avoid the verifier complaining about it.
+    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
   }
-};
 
-struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
+  return Handler.extendRegister(ValVReg, VA);
+}
+
+struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
   AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                              MachineInstrBuilder MIB, CCAssignFn *AssignFn)
-      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}
+      : OutgoingValueHandler(B, MRI, AssignFn), MIB(MIB) {}
 
   MachineInstrBuilder MIB;
@@ -66,7 +60,7 @@
 
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
-    Register ExtReg = extendRegisterMin32(ValVReg, VA);
+    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
 
     // If this is a scalar return, insert a readfirstlane just in case the value
     // ends up in a VGPR.
@@ -93,12 +87,12 @@
   }
 };
 
-struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
+struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
   uint64_t StackUsed = 0;
 
   AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                            CCAssignFn *AssignFn)
-      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}
+      : IncomingValueHandler(B, MRI, AssignFn) {}
 
   Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
@@ -119,22 +113,16 @@
       // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
       auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
-      MIRBuilder.buildTrunc(ValVReg, Copy);
+
+      // If we have signext/zeroext, it applies to the whole 32-bit register
+      // before truncation.
+      auto Extended =
+          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
+      MIRBuilder.buildTrunc(ValVReg, Extended);
       return;
     }
 
-    switch (VA.getLocInfo()) {
-    case CCValAssign::LocInfo::SExt:
-    case CCValAssign::LocInfo::ZExt:
-    case CCValAssign::LocInfo::AExt: {
-      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
-      MIRBuilder.buildTrunc(ValVReg, Copy);
-      break;
-    }
-    default:
-      MIRBuilder.buildCopy(ValVReg, PhysReg);
-      break;
-    }
+    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
   }
 
   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
@@ -180,8 +168,7 @@
   MachineInstrBuilder MIB;
 };
 
-struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
-  MachineInstrBuilder MIB;
+struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   CCAssignFn *AssignFnVarArg;
 
   /// For tail calls, the byte offset of the call's argument area from the
@@ -197,7 +184,7 @@
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
-      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
+      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB, AssignFn),
         AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
   }
 
@@ -226,7 +213,7 @@
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg = extendRegisterMin32(ValVReg, VA);
+    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -50,7 +50,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -70,7 +71,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -139,7 +141,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 8
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -159,7 +162,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 8
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_SEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -194,7 +198,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 16
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_ZEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -214,7 +219,8 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 16
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -288,6 +294,36 @@
   ret void
 }
 
+; The signext is a no-op
+define void @void_func_i32_signext(i32 signext %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_i32_signext
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
+  store i32 %arg0, i32 addrspace(1)* undef
+  ret void
+}
+
+; The zeroext is a no-op
+define void @void_func_i32_zeroext(i32 zeroext %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_i32_zeroext
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
+  store i32 %arg0, i32 addrspace(1)* undef
+  ret void
+}
+
 define void @void_func_p3i8(i8 addrspace(3)* %arg0) #0 {
   ; CHECK-LABEL: name: void_func_p3i8
   ; CHECK: bb.1 (%ir-block.0):
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -85,11 +85,8 @@
 ; GFX7-LABEL: v_mul_i16_zeroext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i16_zeroext: