Index: llvm/lib/CodeGen/AtomicExpandPass.cpp =================================================================== --- llvm/lib/CodeGen/AtomicExpandPass.cpp +++ llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -577,10 +577,6 @@ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; unsigned ValueSize = getAtomicOpSize(AI); if (ValueSize < MinCASSize) { - // TODO: Handle atomicrmw fadd/fsub - if (AI->getType()->isFloatingPointTy()) - return false; - expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { @@ -621,6 +617,7 @@ // These three fields are guaranteed to be set by createMaskInstrs. Type *WordType = nullptr; Type *ValueType = nullptr; + Type *IntValueType = nullptr; Value *AlignedAddr = nullptr; Align AlignedAddrAlignment; // The remaining fields can be null. @@ -685,7 +682,11 @@ const DataLayout &DL = M->getDataLayout(); unsigned ValueSize = DL.getTypeStoreSize(ValueType); - PMV.ValueType = ValueType; + PMV.ValueType = PMV.IntValueType = ValueType; + if (PMV.ValueType->isFloatingPointTy()) + PMV.IntValueType = + Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits()); + PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8) : ValueType; if (PMV.ValueType == PMV.WordType) { @@ -749,8 +750,8 @@ return WideWord; Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted"); - Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted"); - return Trunc; + Value *Trunc = Builder.CreateTrunc(Shift, PMV.IntValueType, "extracted"); + return Builder.CreateBitCast(Trunc, PMV.ValueType); } static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord, @@ -760,6 +761,8 @@ if (PMV.WordType == PMV.ValueType) return Updated; + Updated = Builder.CreateBitCast(Updated, PMV.IntValueType); + Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended"); Value *Shift = Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true); @@ -801,10 +804,14 @@ case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: { - // Finally, comparison ops will operate on the full value, so - // truncate down to the original size, and expand out again after - // doing the operation. + case AtomicRMWInst::UMin: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: { + // Finally, other ops will operate on the full value, so truncate down to + // the original size, and expand out again after doing the + // operation. Bitcasts will be inserted for FP values. Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV); Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded_Extract, Inc); Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV); Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12753,10 +12753,8 @@ case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); - // We don't have a way to support 16-bit atomics now, so just leave them - // as-is. if (Ty->isHalfTy()) - return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -844,28 +844,196 @@ define half @test_atomicrmw_fadd_f16_flat(half* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_flat( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; CI-NEXT: ret half [[RES]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CI-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; CI-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; CI-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; CI-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; CI-NEXT: ret half [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX9-NEXT: ret half [[RES]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX9-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GFX9-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX9-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX9-NEXT: ret half [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX908-NEXT: ret half [[RES]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX908-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GFX908-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX908-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX908-NEXT: ret half [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX90A-NEXT: ret half [[RES]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX90A-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GFX90A-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX90A-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX90A-NEXT: ret half [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX940-NEXT: ret half [[RES]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX940-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GFX940-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX940-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX940-NEXT: ret half [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX11-NEXT: ret half [[RES]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX11-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GFX11-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX11-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX11-NEXT: ret half [[TMP7]] ; %res = atomicrmw fadd half* %ptr, half %value seq_cst ret half %res @@ -873,28 +1041,196 @@ define half @test_atomicrmw_fadd_f16_global(half addrspace(1)* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_global( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; CI-NEXT: ret half [[RES]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CI-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; CI-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; CI-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; CI-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; CI-NEXT: ret half [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX9-NEXT: ret half [[RES]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX9-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GFX9-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX9-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX9-NEXT: ret half [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX908-NEXT: ret half [[RES]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX908-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX908-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX908-NEXT: ret half [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX90A-NEXT: ret half [[RES]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX90A-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GFX90A-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX90A-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX90A-NEXT: ret half [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX940-NEXT: ret half [[RES]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX940-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX940-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX940-NEXT: ret half [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX11-NEXT: ret half [[RES]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX11-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GFX11-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX11-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX11-NEXT: ret half [[TMP7]] ; %res = atomicrmw fadd half addrspace(1)* %ptr, half %value seq_cst ret half %res @@ -902,16 +1238,136 @@ define half @test_atomicrmw_fadd_f16_global_align4(half addrspace(1)* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4 -; CI-NEXT: ret half [[RES]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; CI-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; CI-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; CI-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; CI-NEXT: ret half [[TMP5]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4 -; GFX9-NEXT: ret half [[RES]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GFX9-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GFX9-NEXT: ret half [[TMP5]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4 -; GFX908-NEXT: ret half [[RES]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GFX908-NEXT: ret half [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global_align4( +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GFX90A-NEXT: ret half [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f16_global_align4( +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GFX940-NEXT: ret half [[TMP5]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f16_global_align4( +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GFX11-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GFX11-NEXT: ret half [[TMP5]] ; %res = atomicrmw fadd half addrspace(1)* %ptr, half %value seq_cst, align 4 ret half %res @@ -919,28 +1375,196 @@ define half @test_atomicrmw_fadd_f16_local(half addrspace(3)* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_local( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; CI-NEXT: ret half [[RES]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CI-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; CI-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; CI-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; CI-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; CI-NEXT: ret half [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX9-NEXT: ret half [[RES]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX9-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GFX9-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX9-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX9-NEXT: ret half [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX908-NEXT: ret half [[RES]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX908-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GFX908-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX908-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX908-NEXT: ret half [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX90A-NEXT: ret half [[RES]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX90A-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GFX90A-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX90A-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX90A-NEXT: ret half [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX940-NEXT: ret half [[RES]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX940-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GFX940-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX940-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX940-NEXT: ret half [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GFX11-NEXT: ret half [[RES]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GFX11-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GFX11-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GFX11-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GFX11-NEXT: ret half [[TMP7]] ; %res = atomicrmw fadd half addrspace(3)* %ptr, half %value seq_cst ret half %res Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GCN %s + +define float @test_atomicrmw_fmax_f32_flat(float* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f32_flat( +; GCN-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float* [[PTR]] to i32* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmax float* %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global(float addrspace(1)* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f32_global( +; GCN-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmax float addrspace(1)* %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmax_f32_local(float addrspace(3)* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f32_local( +; GCN-NEXT: [[TMP1:%.*]] = load float, float addrspace(3)* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float addrspace(3)* [[PTR]] to i32 addrspace(3)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmax float addrspace(3)* %ptr, float %value seq_cst + ret float %res +} + +define half @test_atomicrmw_fmax_f16_flat(half* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f16_flat( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmax half* %ptr, half %value seq_cst + ret half %res +} + +define half @test_atomicrmw_fmax_f16_global(half addrspace(1)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f16_global( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmax half addrspace(1)* %ptr, half %value seq_cst + ret half %res +} + +define half @test_atomicrmw_fmax_f16_global_align4(half addrspace(1)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f16_global_align4( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GCN-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP3:%.*]] = call half @llvm.maxnum.f16(half [[TMP2]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP4]] to i32 +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: [[TMP6:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GCN-NEXT: ret half [[TMP6]] +; + %res = atomicrmw fmax half addrspace(1)* %ptr, half %value seq_cst, align 4 + ret half %res +} + +define half @test_atomicrmw_fmax_f16_local(half addrspace(3)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f16_local( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmax half addrspace(3)* %ptr, half %value seq_cst + ret half %res +} + +define double @test_atomicrmw_fmax_f64_flat(double* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f64_flat( +; GCN-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double* [[PTR]] to i64* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmax double* %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global(double addrspace(1)* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f64_global( +; GCN-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64 addrspace(1)* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmax double addrspace(1)* %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmax_f64_local(double addrspace(3)* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f64_local( +; GCN-NEXT: [[TMP1:%.*]] = load double, double addrspace(3)* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64 addrspace(3)* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmax double addrspace(3)* %ptr, double %value seq_cst + ret double %res +} Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GCN %s + +define float @test_atomicrmw_fmin_f32_flat(float* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f32_flat( +; GCN-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float* [[PTR]] to i32* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmin float* %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global(float addrspace(1)* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f32_global( +; GCN-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmin float addrspace(1)* %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_local(float addrspace(3)* %ptr, float %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f32_local( +; GCN-NEXT: [[TMP1:%.*]] = load float, float addrspace(3)* [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast float addrspace(3)* [[PTR]] to i32 addrspace(3)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast float [[LOADED]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[TMP3]], i32 [[TMP5]], i32 [[TMP4]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i32 [[NEWLOADED]] to float +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret float [[TMP7]] +; + %res = atomicrmw fmin float addrspace(3)* %ptr, float %value seq_cst + ret float %res +} + +define half @test_atomicrmw_fmin_f16_flat(half* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f16_flat( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmin half* %ptr, half %value seq_cst + ret half %res +} + +define half @test_atomicrmw_fmin_f16_global(half addrspace(1)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f16_global( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmin half addrspace(1)* %ptr, half %value seq_cst + ret half %res +} + +define half @test_atomicrmw_fmin_f16_global_align4(half addrspace(1)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f16_global_align4( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GCN-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP3:%.*]] = call half @llvm.minnum.f16(half [[TMP2]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP4:%.*]] = bitcast half [[TMP3]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP4]] to i32 +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: [[TMP6:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GCN-NEXT: ret half [[TMP6]] +; + %res = atomicrmw fmin half addrspace(1)* %ptr, half %value seq_cst, align 4 + ret half %res +} + +define half @test_atomicrmw_fmin_f16_local(half addrspace(3)* %ptr, half %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f16_local( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]]) +; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fmin half addrspace(3)* %ptr, half %value seq_cst + ret half %res +} + +define double @test_atomicrmw_fmin_f64_flat(double* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f64_flat( +; GCN-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double* [[PTR]] to i64* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmin double* %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global(double addrspace(1)* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f64_global( +; GCN-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64 addrspace(1)* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmin double addrspace(1)* %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_local(double addrspace(3)* %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f64_local( +; GCN-NEXT: [[TMP1:%.*]] = load double, double addrspace(3)* [[PTR:%.*]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP7:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]]) +; GCN-NEXT: [[TMP3:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)* +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[TMP2]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i64 addrspace(3)* [[TMP3]], i64 [[TMP5]], i64 [[TMP4]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GCN-NEXT: [[TMP7]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[TMP7]] +; + %res = atomicrmw fmin double addrspace(3)* %ptr, double %value seq_cst + ret double %res +} Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll @@ -70,8 +70,36 @@ define half @test_atomicrmw_fsub_f16_flat(half* %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f16_flat( -; GCN-NEXT: [[RES:%.*]] = atomicrmw fsub half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GCN-NEXT: ret half [[RES]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP7]] ; %res = atomicrmw fsub half* %ptr, half %value seq_cst ret half %res @@ -79,8 +107,36 @@ define half @test_atomicrmw_fsub_f16_global(half addrspace(1)* %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f16_global( -; GCN-NEXT: [[RES:%.*]] = atomicrmw fsub half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GCN-NEXT: ret half [[RES]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(1)* @llvm.ptrmask.p1f16.i64(half addrspace(1)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(1)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(1)* [[ALIGNEDADDR]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP7]] ; %res = atomicrmw fsub half addrspace(1)* %ptr, half %value seq_cst ret half %res @@ -88,8 +144,26 @@ define half @test_atomicrmw_fsub_f16_global_align4(half addrspace(1)* %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f16_global_align4( -; GCN-NEXT: [[RES:%.*]] = atomicrmw fsub half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4 -; GCN-NEXT: ret half [[RES]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = bitcast half addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)* +; GCN-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; GCN-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP2]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; GCN-NEXT: [[TMP4:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; GCN-NEXT: ret half [[TMP5]] ; %res = atomicrmw fsub half addrspace(1)* %ptr, half %value seq_cst, align 4 ret half %res @@ -97,8 +171,36 @@ define half @test_atomicrmw_fsub_f16_local(half addrspace(3)* %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f16_local( -; GCN-NEXT: [[RES:%.*]] = atomicrmw fsub half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 -; GCN-NEXT: ret half [[RES]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call half addrspace(3)* @llvm.ptrmask.p3f16.i64(half addrspace(3)* [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint half addrspace(3)* [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half addrspace(3)* [[ALIGNEDADDR]] to i32 addrspace(3)* +; GCN-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(3)* [[ALIGNEDADDR1]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; GCN-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg i32 addrspace(3)* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; GCN-NEXT: ret half [[TMP7]] ; %res = atomicrmw fsub half addrspace(3)* %ptr, half %value seq_cst ret half %res Index: llvm/test/Transforms/AtomicExpand/SPARC/partword.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/SPARC/partword.ll +++ llvm/test/Transforms/AtomicExpand/SPARC/partword.ll @@ -286,3 +286,43 @@ %ret = atomicrmw min i16* %arg, i16 %val seq_cst ret i16 %ret } + +define half @test_atomicrmw_fadd_f16(half* %ptr, half %value) { +; CHECK-LABEL: @test_atomicrmw_fadd_f16( +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call half* @llvm.ptrmask.p0f16.i64(half* [[PTR:%.*]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint half* [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[PTRLSB]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[ALIGNEDADDR1:%.*]] = bitcast half* [[ALIGNEDADDR]] to i32* +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ALIGNEDADDR1]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED]] to half +; CHECK-NEXT: [[NEW:%.*]] = fadd half [[TMP5]], [[VALUE:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast half [[NEW]] to i16 +; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[SHIFTED2:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED2]] +; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg i32* [[ALIGNEDADDR1]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED3:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED4:%.*]] = trunc i32 [[SHIFTED3]] to i16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED4]] to half +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: ret half [[TMP8]] +; + %res = atomicrmw fadd half* %ptr, half %value seq_cst + ret half %res +}