Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2928,16 +2928,15 @@
     return Legalized;
   }

-  // This load needs splitting into power of 2 sized loads.
   if (DstTy.isVector())
     return UnableToLegalize;
-  if (isPowerOf2_32(MemSizeInBits))
-    return UnableToLegalize; // Don't know what we're being asked to do.

   // Big endian lowering not implemented.
   if (MIRBuilder.getDataLayout().isBigEndian())
     return UnableToLegalize;

+  // This load needs splitting into power of 2 sized loads.
+  //
   // Our strategy here is to generate anyextending loads for the smaller
   // types up to next power-2 result type, and then combine the two larger
   // result values together, before truncating back down to the non-pow-2
@@ -2950,8 +2949,21 @@
   // v1 = i24 trunc v5
   // By doing this we generate the correct truncate which should get
   // combined away as an artifact with a matching extend.
-  uint64_t LargeSplitSize = PowerOf2Floor(MemSizeInBits);
-  uint64_t SmallSplitSize = MemSizeInBits - LargeSplitSize;
+
+  uint64_t LargeSplitSize, SmallSplitSize;
+
+  if (!isPowerOf2_32(MemSizeInBits)) {
+    LargeSplitSize = PowerOf2Floor(MemSizeInBits);
+    SmallSplitSize = MemSizeInBits - LargeSplitSize;
+  } else {
+    // Assume we're being asked to decompose an unaligned load.
+    // TODO: If this requires multiple splits, handle them all at once.
+    auto &Ctx = MF.getFunction().getContext();
+    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
+      return UnableToLegalize;
+
+    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
+  }

   MachineMemOperand *LargeMMO =
       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -2976,9 +2988,16 @@

   if (AnyExtTy == DstTy)
     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
-  else {
+  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
     MIRBuilder.buildTrunc(DstReg, {Or});
+  } else {
+    assert(DstTy.isPointer() && "expected pointer");
+    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+
+    // FIXME: We currently consider this to be illegal for non-integral address
+    // spaces, but we still need a way to reinterpret the bits.
+    MIRBuilder.buildIntToPtr(DstReg, Or);
   }

   LoadMI.eraseFromParent();
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir
@@ -259,8 +259,14 @@
     ; GFX8: $vgpr0 = COPY [[SEXTLOAD]](s32)
     ; GFX6-LABEL: name: test_sextload_global_s32_from_2_align1
     ; GFX6: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1)
-    ; GFX6: $vgpr0 = COPY [[SEXTLOAD]](s32)
+    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1)
+    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX6: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[PTR_ADD]](p1) :: (load (s8) from unknown-address + 1, addrspace 1)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[SEXTLOAD]], [[C1]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX6: $vgpr0 = COPY [[OR]](s32)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_SEXTLOAD %0 :: (load (s16), align 1, addrspace 1)
     $vgpr0 = COPY %1
@@ -279,8 +285,14 @@
     ; GFX8: $vgpr0_vgpr1 = COPY [[SEXT]](s64)
     ; GFX6-LABEL: name: test_sextload_global_s64_from_2_align1
     ; GFX6: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1)
-    ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[SEXTLOAD]](s32)
+    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1)
+    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX6: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[PTR_ADD]](p1) :: (load (s8) from unknown-address + 1, addrspace 1)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[SEXTLOAD]], [[C1]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[OR]](s32)
     ; GFX6: $vgpr0_vgpr1 = COPY [[SEXT]](s64)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_SEXTLOAD %0 :: (load (s16), align 1, addrspace 1)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir
@@ -259,8 +259,14 @@
     ; GFX8: $vgpr0 = COPY [[ZEXTLOAD]](s32)
     ; GFX6-LABEL: name: test_zextload_global_s32_from_2_align1
     ; GFX6: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1)
-    ; GFX6: $vgpr0 = COPY [[ZEXTLOAD]](s32)
+    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1)
+    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX6: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p1) :: (load (s8) from unknown-address + 1, addrspace 1)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX6: $vgpr0 = COPY [[OR]](s32)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), align 1, addrspace 1)
     $vgpr0 = COPY %1
@@ -279,8 +285,14 @@
     ; GFX8: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
     ; GFX6-LABEL: name: test_zextload_global_s64_from_2_align1
     ; GFX6: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1)
-    ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ZEXTLOAD]](s32)
+    ; GFX6: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p1) :: (load (s8), addrspace 1)
+    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX6: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p1) :: (load (s8) from unknown-address + 1, addrspace 1)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
     ; GFX6: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_ZEXTLOAD %0 :: (load (s16), align 1, addrspace 1)
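
For reference, the new split-size selection in the LegalizerHelper.cpp hunks above can be summarized as a minimal standalone sketch. It assumes LLVM's MathExtras helpers; computeLoadSplitSizes and the AllowsMemoryAccess parameter are hypothetical stand-ins for the LegalizerHelper member state and the TLI.allowsMemoryAccess() query used in the patch:

    #include <cstdint>
    #include <optional>
    #include <utility>

    #include "llvm/Support/MathExtras.h"

    // Returns {LargeSplitSize, SmallSplitSize} in bits, or std::nullopt when
    // the target can perform the access directly and no split is needed.
    static std::optional<std::pair<uint64_t, uint64_t>>
    computeLoadSplitSizes(uint64_t MemSizeInBits, bool AllowsMemoryAccess) {
      if (!llvm::isPowerOf2_32(MemSizeInBits)) {
        // Non-power-of-2 size: peel off the largest power-of-2 chunk and
        // load the remainder separately (e.g. s24 -> s16 + s8).
        uint64_t Large = llvm::PowerOf2Floor(MemSizeInBits);
        return std::make_pair(Large, MemSizeInBits - Large);
      }
      // Power-of-2 size: assume we were asked to decompose an unaligned
      // load; if the access is actually allowed, there is nothing to do.
      if (AllowsMemoryAccess)
        return std::nullopt;
      // Halve the access; both halves have the same size (e.g. s16 -> s8 +
      // s8, matching the GFX6 align-1 test updates above).
      uint64_t Half = MemSizeInBits / 2;
      return std::make_pair(Half, Half);
    }

In the patch itself the two sizes then drive the ZEXTLOAD / PTR_ADD / SHL / OR sequence checked in the updated GFX6 test lines.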