Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -139,27 +139,6 @@
   return new SIFoldOperands();
 }
 
-static bool isFoldableCopy(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
-  case AMDGPU::V_MOV_B64_PSEUDO: {
-    // If there are additional implicit register operands, this may be used for
-    // register indexing so the source register operand isn't simply copied.
-    unsigned NumOps = MI.getDesc().getNumOperands() +
-      MI.getDesc().getNumImplicitUses();
-
-    return MI.getNumOperands() == NumOps;
-  }
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::COPY:
-    return true;
-  default:
-    return false;
-  }
-}
-
 static bool updateOperand(FoldCandidate &Fold,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
@@ -936,7 +915,7 @@
       tryFoldInst(TII, &MI);
 
-      if (!isFoldableCopy(MI)) {
+      if (!TII->isFoldableCopy(MI)) {
         if (IsIEEEMode || !tryFoldOMod(MI))
           tryFoldClamp(MI);
 
         continue;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,6 +222,8 @@
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                   AliasAnalysis *AA = nullptr) const override;
 
+  bool isFoldableCopy(const MachineInstr &MI) const;
+
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
                      MachineRegisterInfo *MRI) const final;
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1488,6 +1488,27 @@
   }
 }
 
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    // If there are additional implicit register operands, this may be used for
+    // register indexing so the source register operand isn't simply copied.
+    unsigned NumOps = MI.getDesc().getNumOperands() +
+      MI.getDesc().getNumImplicitUses();
+
+    return MI.getNumOperands() == NumOps;
+  }
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
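
Note: isFoldableCopy() moves verbatim from a static helper in SIFoldOperands.cpp
to a SIInstrInfo member so that SIPeepholeSDWA (below) can reuse the same notion
of a side-effect-free copy. As a usage sketch only (the helper getCopiedImm is
invented here for illustration and is not part of this patch): for any MI
accepted by TII->isFoldableCopy(MI), operand 0 is the destination and operand 1
is the copied source, so an immediate source can be read off directly.

    #include "SIInstrInfo.h"
    #include "llvm/ADT/Optional.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Sketch: return the immediate carried by a foldable copy, if any.
    static llvm::Optional<int64_t> getCopiedImm(const SIInstrInfo *TII,
                                                const llvm::MachineInstr &MI) {
      if (!TII->isFoldableCopy(MI))
        return llvm::None;
      const llvm::MachineOperand &Src = MI.getOperand(1);
      if (!Src.isImm())
        return llvm::None;          // copy of a register, not a constant
      return Src.getImm();
    }
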
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -51,6 +51,8 @@
   std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
 
+  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
 public:
   static char ID;
 
@@ -375,6 +377,33 @@
   return true;
 }
 
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+  if (Op.isImm()) {
+    return Op.getImm();
+  }
+
+  // If this is not an immediate then it can be a copy of an immediate, e.g.:
+  // %vreg1 = S_MOV_B32 255;
+  if (Op.isReg()) {
+    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+      if (!isSameReg(Op, Def))
+        continue;
+
+      const MachineInstr *DefInst = Def.getParent();
+      if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
+        return None;
+
+      const MachineOperand &Copied = DefInst->getOperand(1);
+      if (!Copied.isImm())
+        return None;
+
+      return Copied.getImm();
+    }
+  }
+
+  return None;
+}
+
 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
   for (MachineInstr &MI : MBB) {
     unsigned Opcode = MI.getOpcode();
@@ -391,11 +420,11 @@
       // from: v_lshlrev_b32_e32 v1, 16/24, v0
       // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm())
+      auto Imm = foldToImm(*Src0);
+      if (!Imm)
         break;
 
-      int64_t Imm = Src0->getImm();
-      if (Imm != 16 && Imm != 24)
+      if (*Imm != 16 && *Imm != 24)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -406,13 +435,13 @@
 
       if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
         auto SDWADst = make_unique<SDWADstOperand>(
-            Dst, Src1, Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+            Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
         DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
         SDWAOperands[&MI] = std::move(SDWADst);
         ++NumSDWAPatternsFound;
       } else {
         auto SDWASrc = make_unique<SDWASrcOperand>(
-            Src1, Dst, Imm == 16 ? WORD_1 : BYTE_3, false, false,
+            Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
             Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
         DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
         SDWAOperands[&MI] = std::move(SDWASrc);
@@ -433,7 +462,8 @@
       // from: v_lshlrev_b16_e32 v1, 8, v0
       // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm() || Src0->getImm() != 8)
+      auto Imm = foldToImm(*Src0);
+      if (!Imm || *Imm != 8)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -477,30 +507,30 @@
       // 24 | 8 | BYTE_3
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-      if (!Src1->isImm())
+      auto Offset = foldToImm(*Src1);
+      if (!Offset)
         break;
-      int64_t Offset = Src1->getImm();
 
       MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-      if (!Src2->isImm())
+      auto Width = foldToImm(*Src2);
+      if (!Width)
         break;
-      int64_t Width = Src2->getImm();
 
       SdwaSel SrcSel = DWORD;
-      if (Offset == 0 && Width == 8)
+      if (*Offset == 0 && *Width == 8)
         SrcSel = BYTE_0;
-      else if (Offset == 0 && Width == 16)
+      else if (*Offset == 0 && *Width == 16)
        SrcSel = WORD_0;
-      else if (Offset == 0 && Width == 32)
+      else if (*Offset == 0 && *Width == 32)
        SrcSel = DWORD;
-      else if (Offset == 8 && Width == 8)
+      else if (*Offset == 8 && *Width == 8)
        SrcSel = BYTE_1;
-      else if (Offset == 16 && Width == 8)
+      else if (*Offset == 16 && *Width == 8)
        SrcSel = BYTE_2;
-      else if (Offset == 16 && Width == 16)
+      else if (*Offset == 16 && *Width == 16)
        SrcSel = WORD_1;
-      else if (Offset == 24 && Width == 8)
+      else if (*Offset == 24 && *Width == 8)
        SrcSel = BYTE_3;
       else
         break;
@@ -526,11 +556,11 @@
       // to SDWA src:v0 src_sel:WORD_0/BYTE_0
 
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm())
+      auto Imm = foldToImm(*Src0);
+      if (!Imm)
         break;
 
-      int64_t Imm = Src0->getImm();
-      if (Imm != 0x0000ffff && Imm != 0x000000ff)
+      if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -541,7 +571,7 @@
         break;
 
       auto SDWASrc = make_unique<SDWASrcOperand>(
-          Src1, Dst, Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+          Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
       DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
       SDWAOperands[&MI] = std::move(SDWASrc);
       ++NumSDWAPatternsFound;
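
Note: foldToImm() deliberately looks through at most one single-def copy in the
same basic block, so SDWA matching stays local. Sketched effect on the matcher
cases above (virtual register names invented for illustration; the fragment
just restates the V_LSHLREV_B32 case in condensed form):

    // Before this patch only a literal shift amount could match:
    //   %vreg3 = V_LSHLREV_B32_e32 16, %vreg0       ; matched
    // Now an immediate arriving through a foldable copy matches too:
    //   %vreg2 = S_MOV_B32 16
    //   %vreg3 = V_LSHLREV_B32_e32 %vreg2, %vreg0   ; also matched
    // Each case now follows the same Optional-based shape:
    auto Imm = foldToImm(*Src0);      // Optional<int64_t>
    if (!Imm || (*Imm != 16 && *Imm != 24))
      break;                          // not an SDWA-able shift amount
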
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -72,8 +72,9 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
@@ -92,10 +93,12 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
 entry:
@@ -114,14 +117,18 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
 entry:
@@ -155,7 +162,9 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_f16_sdwa
 
-; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
@@ -176,7 +185,8 @@
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
 entry:
@@ -199,10 +209,10 @@
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
 entry:
@@ -347,26 +357,28 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mul_add_v2i16:
+; GCN-LABEL: {{^}}mul_add_shr_i32:
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 ; NOSDWA-NOT: v_add_i32_sdwa
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 ; SDWA-NOT: v_add_i32_sdwa
 
-define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+define void @mul_add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ina, i32 addrspace(1)* %inb, i1 addrspace(1)* %incond) {
 entry:
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %a = load i32, i32 addrspace(1)* %ina, align 4
+  %b = load i32, i32 addrspace(1)* %inb, align 4
   %cond = load i1, i1 addrspace(1)* %incond, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
   br i1 %cond, label %mul_label, label %add_label
 mul_label:
-  %mul = mul <2 x i16> %a, %b
+  %mul = mul i32 %shra, %shrb
   br label %store_label
 add_label:
-  %add = add <2 x i16> %a, %b
+  %add = add i32 %shra, %shrb
   br label %store_label
 store_label:
-  %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label]
-  store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4
+  %store = phi i32 [%mul, %mul_label], [%add, %add_label]
+  store i32 %store, i32 addrspace(1)* %out, align 4
   ret void
 }
\ No newline at end of file
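
Note: the last test is renamed from mul_add_v2i16 to mul_add_shr_i32 and
rewritten so that the lshr results (%shra, %shrb) are defined in entry but
consumed in mul_label/add_label. The defining instructions therefore sit in a
different basic block from their uses, exercising the rule that SDWA patterns
must not be combined across basic blocks (compare the isSameBB guard in
foldToImm); the SDWA-NOT lines verify that no SDWA form is emitted. The updated
checks can be re-run with the test's existing RUN lines (they live at the top
of sdwa-peephole.ll and are not part of this hunk); presumably something along
the lines of:

    llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < sdwa-peephole.ll \
      | FileCheck -check-prefix=GCN -check-prefix=SDWA sdwa-peephole.ll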