Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1702,12 +1702,46 @@
   return true;
 }
 
+static SDValue stripBitcast(SDValue Val) {
+  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+  In = stripBitcast(In);
+  if (In.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue Srl = In.getOperand(0);
+  if (Srl.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      if (ShiftAmt->getZExtValue() == 16) {
+        Out = stripBitcast(Srl.getOperand(0));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = In.getOperand(0);
+    if (Src.getValueType().getSizeInBits() == 32)
+      return stripBitcast(Src);
+  }
+
+  return In;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                          SDValue &SrcMods) const {
   unsigned Mods = 0;
   Src = In;
 
-  // FIXME: Look for on separate components
   if (Src.getOpcode() == ISD::FNEG) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = Src.getOperand(0);
@@ -1716,19 +1750,28 @@
   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
     unsigned VecMods = Mods;
 
-    SDValue Lo = Src.getOperand(0);
-    SDValue Hi = Src.getOperand(1);
+    SDValue Lo = stripBitcast(Src.getOperand(0));
+    SDValue Hi = stripBitcast(Src.getOperand(1));
 
     if (Lo.getOpcode() == ISD::FNEG) {
-      Lo = Lo.getOperand(0);
+      Lo = stripBitcast(Lo.getOperand(0));
       Mods ^= SISrcMods::NEG;
     }
 
     if (Hi.getOpcode() == ISD::FNEG) {
-      Hi = Hi.getOperand(0);
+      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
     }
 
+    if (isExtractHiElt(Lo, Lo))
+      Mods |= SISrcMods::OP_SEL_0;
+
+    if (isExtractHiElt(Hi, Hi))
+      Mods |= SISrcMods::OP_SEL_1;
+
+    Lo = stripExtractLoElt(Lo);
+    Hi = stripExtractLoElt(Hi);
+
     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
       // Really a scalar input. Just select from the low half of the register to
       // avoid packing.
@@ -1742,9 +1785,6 @@
   }
 
   // Packed instructions do not have abs modifiers.
-
-  // FIXME: Handle abs/neg of individual components.
-  // FIXME: Handle swizzling with op_sel
   Mods |= SISrcMods::OP_SEL_1;
 
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2593,6 +2593,15 @@
   SDValue Vec = Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
 
+  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+
+  // Make sure we do any optimizations that will make it easier to fold
+  // source modifiers before obscuring it with bit operations.
+
+  // XXX - Why doesn't this get called when vector_shuffle is expanded?
+  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
+    return Combined;
+
   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
     SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
Index: test/CodeGen/AMDGPU/packed-op-sel.ll
===================================================================
--- test/CodeGen/AMDGPU/packed-op-sel.ll
+++ test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -181,8 +181,7 @@
 ; GCN-NOT: shl
 ; GCN-NOT: or
 
-; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
-; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
 bb:
   %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
@@ -260,6 +259,434 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
+  %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
+
+  %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
+define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
+
+  %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+  %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
+
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
+  %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
+  %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fsub float -0.0, %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %result = fadd <2 x half> %vec0, %bc
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fsub float -0.0, %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = fadd <2 x half> %vec0, %shuf
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}extract_from_i64:
+; GCN: v_lshl_or_b32
+; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+  %i64 = load volatile i64, i64 addrspace(1)* undef
+
+  %elt0 = trunc i64 %i64 to i16
+  %hi = lshr i64 %i64, 16
+  %elt1 = trunc i64 %hi to i16
+
+  %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
+  %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
+  %result = add <2 x i16> %vec0, %ins1
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}mix_elt_types_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %scalar1 = load volatile half, half addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
+
+  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
 
 attributes #0 = { nounwind }