Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -256,6 +256,7 @@
     };
     return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
   }
+  case ISD::SCALAR_TO_VECTOR:
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
     const AMDGPURegisterInfo *TRI =
@@ -264,7 +265,8 @@
                    static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
-    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    EVT EltVT = VT.getVectorElementType();
+    assert(EltVT.bitsEq(MVT::i32));
     if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       bool UseVReg = true;
       for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
@@ -313,8 +315,7 @@
 
     SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
 
     if (NumVectorElts == 1) {
-      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
-                                  VT.getVectorElementType(),
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
                                   N->getOperand(0), RegClass);
     }
@@ -323,11 +324,12 @@
     // 16 = Max Num Vector Elements
     // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
     // 1 = Vector Register Class
-    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1);
+    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
     RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
     bool IsRegSeq = true;
-    for (unsigned i = 0; i < N->getNumOperands(); i++) {
+    unsigned NOps = N->getNumOperands();
+    for (unsigned i = 0; i < NOps; i++) {
       // XXX: Why is this here?
       if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
         IsRegSeq = false;
@@ -337,6 +339,20 @@
       RegSeqArgs[1 + (2 * i) + 1] =
               CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
     }
+
+    if (NOps != NumVectorElts) {
+      // Fill in the missing undef elements if this was a scalar_to_vector.
+      assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+      MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                     SDLoc(N), EltVT);
+      for (unsigned i = NOps; i < NumVectorElts; ++i) {
+        RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+        RegSeqArgs[1 + (2 * i) + 1] =
+          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+      }
+    }
+
     if (!IsRegSeq)
       break;
     return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -1881,7 +1881,8 @@
 def : BitConvert ;
 def : BitConvert ;
 def : BitConvert ;
-
+def : BitConvert ;
+def : BitConvert ;
 def : BitConvert ;
 def : BitConvert ;
Index: test/CodeGen/R600/bitcast.ll
===================================================================
--- test/CodeGen/R600/bitcast.ll
+++ test/CodeGen/R600/bitcast.ll
@@ -42,3 +42,17 @@
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
+
+define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %load = load float addrspace(1)* %in, align 4
+  %bc = bitcast float %load to <2 x i16>
+  store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+  %load = load <2 x i16> addrspace(1)* %in, align 4
+  %bc = bitcast <2 x i16> %load to float
+  store float %bc, float addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/R600/scalar_to_vector.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/scalar_to_vector.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: @scalar_to_vector_v2i32
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: S_ENDPGM
+define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tmp1 = load i32 addrspace(1)* %in, align 4
+  %bc = bitcast i32 %tmp1 to <2 x i16>
+  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @scalar_to_vector_v2f32
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: S_ENDPGM
+define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tmp1 = load float addrspace(1)* %in, align 4
+  %bc = bitcast float %tmp1 to <2 x i16>
+  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+  ret void
+}
+
+; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
+; to produce one, but for some reason never made it to selection.
+
+
+; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+;   %tmp1 = load i32 addrspace(1)* %in, align 4
+;   %bc = bitcast i32 %tmp1 to <4 x i8>
+
+;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;   store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
+;   ret void
+; }
+
+; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
+;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
+;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
+;   %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
+;   store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
+;   ret void
+; }
+
+; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
+;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
+;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
+;   store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
+;   ret void
+; }
+
+; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+;   ret void
+; }
+
+; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind {
+;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+;   ret void
+; }
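
For reference: the net effect of the AMDGPUISelDAGToDAG.cpp change is that a SCALAR_TO_VECTOR node now selects to the same REG_SEQUENCE a BUILD_VECTOR would, with the lanes it does not define all padded by one shared IMPLICIT_DEF. The sketch below is standalone C++, not LLVM code (the string values are invented for illustration); it only models the RegSeqArgs operand layout the patch fills in: slot 0 holds the register class ID, and each lane i contributes a value at index 1 + 2*i and a subregister index at 1 + 2*i + 1.

// Standalone sketch of the REG_SEQUENCE operand layout built above.
// Not LLVM code; placeholder strings stand in for SDValues.
#include <cstdio>
#include <string>
#include <vector>

int main() {
  const unsigned NumVectorElts = 4; // e.g. selecting a v4i32 node
  const unsigned NOps = 1;          // SCALAR_TO_VECTOR supplies one operand

  std::vector<std::string> RegSeqArgs(NumVectorElts * 2 + 1);
  RegSeqArgs[0] = "RegClassID";

  // Defined lanes: one (value, subreg index) pair per operand.
  for (unsigned i = 0; i < NOps; ++i) {
    RegSeqArgs[1 + (2 * i)] = "op" + std::to_string(i);
    RegSeqArgs[1 + (2 * i) + 1] = "sub" + std::to_string(i);
  }
  // The patch's new path: pad the remaining lanes with IMPLICIT_DEF.
  for (unsigned i = NOps; i < NumVectorElts; ++i) {
    RegSeqArgs[1 + (2 * i)] = "IMPLICIT_DEF";
    RegSeqArgs[1 + (2 * i) + 1] = "sub" + std::to_string(i);
  }

  // Prints: RegClassID op0 sub0 IMPLICIT_DEF sub1 IMPLICIT_DEF sub2 ...
  for (const std::string &Arg : RegSeqArgs)
    std::printf("%s ", Arg.c_str());
  std::printf("\n");
  return 0;
}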