diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -27,6 +27,7 @@
 def ThreadIdClass : dxil_class<"ThreadId">;
 def GroupIdClass : dxil_class<"GroupId">;
 def CBufferLoadClass : dxil_class<"CBufferLoad">;
+def CBufferLoadLegacyClass : dxil_class<"CBufferLoadLegacy">;
 def CreateHandleClass : dxil_class<"CreateHandle">;
 
 def binary_uint : dxil_category<"Binary uint">;
@@ -155,6 +156,14 @@
       dxil_param<4, "i32", "alignment", "load access alignment", 1>
     ]>;
 
+def CBufferLoadLegacy : dxil_op<"CBufferLoadLegacy", 59, CBufferLoadLegacyClass, Resources,
+  "loads a value from a constant buffer resource", "half;float;double;i16;i32;i64;", "ro",
+  [
+    dxil_param<0, "dx.types.CBufRet", "", "the value for the constant buffer variable">,
+    dxil_param<1, "i32", "opcode", "DXIL opcode">,
+    dxil_param<2, "dx.types.Handle", "srv", "cbuffer handle">,
+    dxil_param<3, "i32", "regIndex", "0-based index into cbuffer instance">
+  ]>;
+
 def CreateHandle : dxil_op< "CreateHandle", 57, CreateHandleClass, Resources,
   "creates the handle to a resource",
   "void;", "ro",
   [
diff --git a/llvm/lib/Target/DirectX/DXILCBufferLowering.cpp b/llvm/lib/Target/DirectX/DXILCBufferLowering.cpp
--- a/llvm/lib/Target/DirectX/DXILCBufferLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferLowering.cpp
@@ -35,14 +35,304 @@
 namespace {
 struct BufAccess {
   Value *Index; // Address used when accessing the cbuffer.
+  unsigned Channel; // Channel within the row for the legacy cbuffer layout.
   Instruction *User; // The instruction which accesses the cbuffer.
 };
 
+class LegacyCBufferLayout {
+  struct LegacyStructLayout {
+    StructType *ST;
+    SmallVector<unsigned> Offsets;
+    unsigned Size;
+    std::pair<unsigned, unsigned> getElementLegacyOffset(unsigned Idx) const {
+      assert(Idx < Offsets.size() && "Invalid element idx!");
+      unsigned Offset = Offsets[Idx];
+      unsigned Ch = Offset & (RowAlign - 1);
+      return std::make_pair((Offset - Ch) / RowAlign, Ch);
+    }
+  };
+
+public:
+  LegacyCBufferLayout(const DataLayout &DL) : DL(DL) {}
+  int64_t getIndexedOffsetInType(Type *ElemTy, ArrayRef<Value *> Indices,
+                                 unsigned &Ch);
+  Value *emitGEPOffset(IRBuilder<> *Builder, GEPOperator *GEPOp, unsigned &Ch);
+
+private:
+  unsigned applyRowAlign(unsigned Offset, Type *EltTy);
+  unsigned getTypeAllocSize(Type *Ty);
+  LegacyStructLayout &getStructLayout(StructType *ST);
+  const DataLayout &DL;
+  SmallDenseMap<StructType *, LegacyStructLayout> StructLayouts;
+  // Legacy rows are 4 dwords (16 bytes):
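+  // scalars pack at their natural alignment, a vector moves to the next row
+  // if it would otherwise straddle one, and structs and array elements always
+  // start on a new row.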
+  static const unsigned RowAlign = 16;
+  static unsigned align(unsigned Offset, unsigned Alignment) {
+    return (Offset + Alignment - 1) / Alignment * Alignment;
+  }
+  static unsigned alignTo4Dwords(unsigned Offset) {
+    return align(Offset, RowAlign);
+  }
+};
+
 } // namespace
 
-static void collectBufUserAccess(User *U, Value *Addr,
+unsigned LegacyCBufferLayout::applyRowAlign(unsigned Offset, Type *EltTy) {
+  unsigned AlignedOffset = alignTo4Dwords(Offset);
+
+  if (AlignedOffset == Offset)
+    return Offset;
+
+  if (isa<StructType>(EltTy) || isa<ArrayType>(EltTy))
+    return AlignedOffset;
+  unsigned Size = DL.getTypeStoreSize(EltTy);
+  if ((Offset + Size) > AlignedOffset)
+    return AlignedOffset;
+  else
+    return Offset;
+}
+
+unsigned LegacyCBufferLayout::getTypeAllocSize(Type *Ty) {
+  if (auto *ST = dyn_cast<StructType>(Ty)) {
+    LegacyStructLayout &Layout = getStructLayout(ST);
+    return Layout.Size;
+  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+    unsigned NumElts = AT->getNumElements();
+    if (NumElts == 0)
+      return 0;
+
+    unsigned EltSize = getTypeAllocSize(AT->getElementType());
+    unsigned AlignedEltSize = alignTo4Dwords(EltSize);
+    // Each new array element starts 4-dword aligned.
+    return AlignedEltSize * (NumElts - 1) + EltSize;
+  } else {
+    return DL.getTypeStoreSize(Ty);
+  }
+}
+
+LegacyCBufferLayout::LegacyStructLayout &
+LegacyCBufferLayout::getStructLayout(StructType *ST) {
+  auto it = StructLayouts.find(ST);
+  if (it != StructLayouts.end())
+    return it->second;
+
+  unsigned Offset = 0;
+  LegacyStructLayout Layout;
+  Layout.ST = ST;
+  for (Type *EltTy : ST->elements()) {
+    unsigned EltSize = getTypeAllocSize(EltTy);
+    if (unsigned ScalarSize = EltTy->getScalarSizeInBits())
+      Offset = align(Offset, ScalarSize >> 3);
+    Offset = applyRowAlign(Offset, EltTy);
+    Layout.Offsets.emplace_back(Offset);
+    Offset += EltSize;
+  }
+  Layout.Size = Offset;
+  StructLayouts[ST] = Layout;
+  return StructLayouts[ST];
+}
+
+int64_t LegacyCBufferLayout::getIndexedOffsetInType(Type *ElemTy,
+                                                    ArrayRef<Value *> Indices,
+                                                    unsigned &Ch) {
+  int64_t Result = 0;
+
+  generic_gep_type_iterator<Value *const *>
+      GTI = gep_type_begin(ElemTy, Indices),
+      GTE = gep_type_end(ElemTy, Indices);
+  // Mark whether the current GEP index operand indexes into an array.
+  // The first index operand is always array-style indexing, as in ptr[idx].
+  bool IsArrayIndexing = true;
+  bool IsVectorIndexing = false;
+  for (; GTI != GTE; ++GTI) {
+    Value *Idx = GTI.getOperand();
+    if (StructType *STy = GTI.getStructTypeOrNull()) {
+      assert(Idx->getType()->isIntegerTy(32) && "Illegal struct idx");
+      unsigned FieldNo = cast<ConstantInt>(Idx)->getZExtValue();
+
+      // Get structure layout information...
+      const auto &Layout = getStructLayout(STy);
+
+      // Add in the offset, as calculated by the structure layout info...
+      auto [Offset, Channel] = Layout.getElementLegacyOffset(FieldNo);
+      Result += Offset;
+      Ch = Channel;
+    } else {
+      // Get the array index and the size of each array element.
+      if (int64_t ArrayIdx = cast<ConstantInt>(Idx)->getSExtValue()) {
+        unsigned EltSize = getTypeAllocSize(GTI.getIndexedType());
+        // Array indexing needs 4-dword alignment.
+        if (IsArrayIndexing) {
+          EltSize = alignTo4Dwords(EltSize);
+          Ch = 0;
+        }
+
+        if (IsVectorIndexing) {
+          // Vector indexing only changes the channel, because a vector never
+          // crosses a row, except for 64-bit element types.
+          if (EltSize > 4) {
+            switch (ArrayIdx) {
+            case 0:
+              break;
+            case 1:
+              Ch += EltSize;
+              break;
+            case 2:
+              Result += 1;
+              break;
+            case 3:
+              Ch += EltSize;
+              Result += 1;
+              break;
+            default:
+              llvm_unreachable("invalid vector size");
+              break;
+            }
+          } else {
+            Ch += ArrayIdx * EltSize;
+          }
+        } else {
+          Result += ArrayIdx * (EltSize / RowAlign);
+        }
+      }
+    }
+    IsArrayIndexing = isa<ArrayType>(GTI.getIndexedType());
+    IsVectorIndexing = isa<VectorType>(GTI.getIndexedType());
+  }
+
+  return Result;
+}
+
+Value *LegacyCBufferLayout::emitGEPOffset(IRBuilder<> *Builder,
+                                          GEPOperator *GEPOp, unsigned &Ch) {
+  Type *IntIdxTy = DL.getIndexType(GEPOp->getType());
+  Value *Result = nullptr;
+
+  // If the GEP is inbounds, we know that none of the addressing operations
+  // will overflow in a signed sense.
+  bool isInBounds = GEPOp->isInBounds();
+
+  // Build a mask for high order bits.
+  unsigned IntPtrWidth = IntIdxTy->getScalarType()->getIntegerBitWidth();
+  uint64_t PtrSizeMask =
+      std::numeric_limits<uint64_t>::max() >> (64 - IntPtrWidth);
+  // Mark whether the current GEP index operand indexes into an array.
+  // The first index operand is always array-style indexing, as in ptr + idx.
+  bool IsArrayIndexing = true;
+  bool IsVectorIndexing = false;
+  gep_type_iterator GTI = gep_type_begin(GEPOp);
+  for (User::op_iterator i = GEPOp->op_begin() + 1, e = GEPOp->op_end(); i != e;
+       ++i, ++GTI) {
+    Value *Op = *i;
+    uint64_t Size = getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
+    // Array indexing needs 4-dword alignment.
+    if (IsArrayIndexing) {
+      Size = alignTo4Dwords(Size);
+      Ch = 0;
+    }
+    if (IsVectorIndexing) {
+      // Vector indexing only changes the channel, because a vector never
+      // crosses a row, except for double.
+      // FIXME: handle dynamic indexing into a vector inside a cbuffer.
+      Constant *OpC = cast<Constant>(Op);
+      unsigned ArrayIdx = OpC->getUniqueInteger().getZExtValue();
+      if (Size > 4) {
+        switch (ArrayIdx) {
+        case 0:
+          break;
+        case 1:
+          Ch += Size;
+          break;
+        case 2:
+          // The element lands in the next row.
+          Result = Result ? Builder->CreateAdd(Result,
+                                               ConstantInt::get(IntIdxTy, 1))
+                          : ConstantInt::get(IntIdxTy, 1);
+          break;
+        case 3:
+          Ch += Size;
+          Result = Result ? Builder->CreateAdd(Result,
+                                               ConstantInt::get(IntIdxTy, 1))
+                          : ConstantInt::get(IntIdxTy, 1);
+          break;
+        default:
+          llvm_unreachable("invalid vector size");
+          break;
+        }
+      } else {
+        Ch += ArrayIdx * Size;
+      }
+      continue;
+    }
+
+    Value *Offset;
+    if (Constant *OpC = dyn_cast<Constant>(Op)) {
+      if (OpC->isZeroValue())
+        continue;
+
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (StructType *STy = GTI.getStructTypeOrNull()) {
+        Ch = 0;
+        uint64_t OpValue = OpC->getUniqueInteger().getZExtValue();
+        // Get structure layout information...
+        const auto &Layout = getStructLayout(STy);
+        auto [EltOffset, Channel] = Layout.getElementLegacyOffset(OpValue);
+        Ch = Channel;
+        if (!EltOffset)
+          continue;
+
+        Offset = ConstantInt::get(IntIdxTy, EltOffset);
+      } else {
+        // Splat the constant if needed.
+        if (IntIdxTy->isVectorTy() && !OpC->getType()->isVectorTy())
+          OpC = ConstantVector::getSplat(
+              cast<VectorType>(IntIdxTy)->getElementCount(), OpC);
+
+        Constant *Scale = ConstantInt::get(IntIdxTy, Size);
+        // >> 4 because each row is 4 dwords.
+        Scale = ConstantExpr::getLShr(Scale, ConstantInt::get(IntIdxTy, 4));
+        Constant *OC =
+            ConstantExpr::getIntegerCast(OpC, IntIdxTy, true /*SExt*/);
+        Offset =
+            ConstantExpr::getMul(OC, Scale, false /*NUW*/, isInBounds /*NSW*/);
+      }
+    } else {
+      // Splat the index if needed.
+      if (IntIdxTy->isVectorTy() && !Op->getType()->isVectorTy())
+        Op = Builder->CreateVectorSplat(
+            cast<FixedVectorType>(IntIdxTy)->getNumElements(), Op);
+
+      // Convert to the correct type.
+      if (Op->getType() != IntIdxTy)
+        Op = Builder->CreateIntCast(Op, IntIdxTy, true,
+                                    Op->getName().str() + ".c");
+
+      // >> 4 because each row is 4 dwords.
+      Size >>= 4;
+      Constant *Scale = ConstantInt::get(IntIdxTy, Size);
+
+      if (Size != 1) {
+        // We'll let instcombine(mul) convert this to a shl if possible.
+        Op = Builder->CreateMul(Op, Scale, GEPOp->getName().str() + ".idx",
+                                false /*NUW*/, isInBounds /*NSW*/);
+      }
+      Offset = Op;
+    }
+
+    if (Result)
+      Result =
+          Builder->CreateAdd(Result, Offset, GEPOp->getName().str() + ".offs",
+                             false /*NUW*/, isInBounds /*NSW*/);
+    else
+      Result = Offset;
+    IsArrayIndexing = isa<ArrayType>(GTI.getIndexedType());
+    IsVectorIndexing = isa<VectorType>(GTI.getIndexedType());
+  }
+  return Result ? Result : Constant::getNullValue(IntIdxTy);
+}
+
+static void collectBufUserAccess(User *U, Value *Addr, unsigned Channel,
                                  std::vector<BufAccess> &AccessList,
-                                 const DataLayout &DL) {
+                                 LegacyCBufferLayout &LegacyCBLayout) {
+  bool IsZeroAddr = false;
+  if (auto *CAddr = dyn_cast<ConstantInt>(Addr))
+    IsZeroAddr = CAddr->getLimitedValue() == 0;
+
   if (auto *GEP = dyn_cast<GEPOperator>(U)) {
     // Calculate new Addr.
     Value *NewAddr = Addr;
@@ -51,34 +341,42 @@
       B.SetInsertPoint(Inst);
     if (GEP->hasAllConstantIndices()) {
       SmallVector<Value *> IdxList(GEP->idx_begin(), GEP->idx_end());
-      NewAddr = B.CreateAdd(Addr, B.getInt32(DL.getIndexedOffsetInType(
-                                      GEP->getSourceElementType(), IdxList)));
+      unsigned Index = LegacyCBLayout.getIndexedOffsetInType(
+          GEP->getSourceElementType(), IdxList, Channel);
+      if (IsZeroAddr)
+        NewAddr = B.getInt32(Index);
+      else
+        NewAddr = B.CreateAdd(Addr, B.getInt32(Index));
     } else {
-      Value *Offset = EmitGEPOffset(&B, DL, GEP, /*NoAssumptions=*/true);
-      NewAddr = B.CreateAdd(Addr, Offset);
+      Value *Offset = LegacyCBLayout.emitGEPOffset(&B, GEP, Channel);
+      if (IsZeroAddr)
+        NewAddr = Offset;
+      else
+        NewAddr = B.CreateAdd(Addr, Offset);
     }
     for (User *GEPU : GEP->users()) {
-      collectBufUserAccess(GEPU, NewAddr, AccessList, DL);
+      collectBufUserAccess(GEPU, NewAddr, Channel, AccessList, LegacyCBLayout);
     }
   } else if (isa<AddrSpaceCastOperator>(U) || isa<BitCastOperator>(U)) {
     for (User *AU : U->users()) {
-      collectBufUserAccess(AU, Addr, AccessList, DL);
+      collectBufUserAccess(AU, Addr, Channel, AccessList, LegacyCBLayout);
     }
   } else if (auto *LI = dyn_cast<LoadInst>(U)) {
-    BufAccess Access = {Addr, LI};
+    BufAccess Access = {Addr, Channel, LI};
     AccessList.emplace_back(Access);
   } else
     llvm_unreachable("unsupported user");
 }
 
-static std::vector<BufAccess> collectBufAccess(GlobalVariable *GV,
-                                               const DataLayout &DL) {
+static std::vector<BufAccess>
+collectBufAccess(GlobalVariable *GV, LegacyCBufferLayout &LegacyCBLayout) {
   auto &Ctx = GV->getContext();
   Value *OffsetZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
   std::vector<BufAccess> AccessList;
   for (User *U : GV->users()) {
-    collectBufUserAccess(U, OffsetZero, AccessList, DL);
+    unsigned Channel = 0;
+    collectBufUserAccess(U, OffsetZero, Channel, AccessList, LegacyCBLayout);
   }
   return AccessList;
 }
@@ -90,14 +388,13 @@
   if (!ResTable)
     return false;
   const DataLayout &DL = M.getDataLayout();
+  LegacyCBufferLayout LegacyCBLayout(DL);
   for (auto *Res : ResTable->operands()) {
     assert(Res->getNumOperands() == 5 && "invalid resource metadata");
     auto *GVMD = cast<ConstantAsMetadata>(Res->getOperand(0).get());
     auto *GV = cast<GlobalVariable>(GVMD->getValue());
-    assert(GV->getAddressSpace() == DXIL::CBufferAddrSpace &&
-           "invalid global variable for cbuffer");
-    std::vector<BufAccess> AccessList = collectBufAccess(GV, DL);
+    std::vector<BufAccess> AccessList = collectBufAccess(GV, LegacyCBLayout);
 
     SmallDenseMap<Function *, CallInst *> HandleMap;
     uint64_t RangeID =
@@ -127,30 +424,45 @@
       IRBuilder<> B(LI);
       DXILOpBuilder DXILB(M, B);
+      unsigned Ch = Access.Channel;
+      unsigned EltSizeInBytes = Ty->getScalarSizeInBits() >> 3;
+      Ch /= EltSizeInBytes;
       if (Ty->isIntegerTy() || Ty->isFloatingPointTy()) {
-        CBLd = DXILB.createCBufferLoad(Ty, Hdl, Index,
-                                       DL.getPrefTypeAlign(Ty).value());
+        CBLd = DXILB.createCBufferLoadLegacy(Ty, Hdl, Index);
+        CBLd = B.CreateExtractValue(CBLd, Ch);
       } else if (isa<FixedVectorType>(Ty)) {
         // Only fixed vector types are supported.
         auto *VT = cast<FixedVectorType>(Ty);
-        VT;
         Value *Result = PoisonValue::get(VT);
         Type *EltTy = VT->getElementType();
-        uint64_t Align = DL.getPrefTypeAlign(EltTy).value();
-        for (unsigned i = 0; i < VT->getNumElements(); ++i) {
-          Value *Offset =
-              B.CreateAdd(Index, B.getInt32(i * DL.getTypeAllocSize(EltTy)));
-          Value *Elt = DXILB.createCBufferLoad(EltTy, Hdl, Offset, Align);
-          Result = B.CreateInsertElement(Result, Elt, i);
+
+        CBLd = DXILB.createCBufferLoadLegacy(EltTy, Hdl, Index);
+        unsigned EltSize = DL.getTypeAllocSize(EltTy).getFixedSize();
+        // The vector crosses a 4-dword row.
+        if (EltSize > 4 && VT->getNumElements() > 2) {
+          for (unsigned i = 0; i < 2; ++i) {
+            Value *Elt = B.CreateExtractValue(CBLd, (i + Ch));
+            Result = B.CreateInsertElement(Result, Elt, i);
+          }
+          Index = B.CreateAdd(Index, ConstantInt::get(Index->getType(), 1));
+          CBLd = DXILB.createCBufferLoadLegacy(EltTy, Hdl, Index);
+          for (unsigned i = 0; i < VT->getNumElements() - 2; ++i) {
+            Value *Elt = B.CreateExtractValue(CBLd, (i + Ch));
+            Result = B.CreateInsertElement(Result, Elt, i + 2);
+          }
+        } else {
+          for (unsigned i = 0; i < VT->getNumElements(); ++i) {
+            Value *Elt = B.CreateExtractValue(CBLd, (i + Ch));
+            Result = B.CreateInsertElement(Result, Elt, i);
+          }
         }
         CBLd = Result;
       } else {
+        llvm_unreachable("failed to SROA");
       }
 
       LI->replaceAllUsesWith(CBLd);
       LI->eraseFromParent();
     }
-
-    return false;
   }
   return true;
 }
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.h
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -39,6 +39,8 @@
                            bool NonUniformIndex);
   CallInst *createCBufferLoad(Type *OverloadTy, Value *Hdl, Value *ByteOffset,
                               uint32_t Alignment);
+  CallInst *createCBufferLoadLegacy(Type *OverloadTy, Value *Hdl,
+                                    Value *RegIndex);
 
 private:
   Module &M;
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -182,6 +182,40 @@
   return getOrCreateStructType(TypeName, FieldTypes, Ctx);
 }
 
+static StructType *getCBufRetType(Type *OverloadTy, LLVMContext &Ctx) {
+  OverloadKind Kind = getOverloadKind(OverloadTy);
+  std::string TypeName = constructOverloadTypeName(Kind, "dx.types.CBufRet.");
+  unsigned OverloadSize = 0;
+  if (OverloadTy->isFloatTy())
+    OverloadSize = 32;
+  else if (OverloadTy->isDoubleTy())
+    OverloadSize = 64;
+  else if (OverloadTy->isHalfTy())
+    OverloadSize = 16;
+  else
+    OverloadSize = cast<IntegerType>(OverloadTy)->getBitWidth();
+
+  switch (OverloadSize) {
+  case 32: {
+    Type *FieldTypes[4] = {OverloadTy, OverloadTy, OverloadTy, OverloadTy};
+    return getOrCreateStructType(TypeName, FieldTypes, Ctx);
+  }
+  case 64: {
+    Type *FieldTypes[2] = {OverloadTy, OverloadTy};
+    return getOrCreateStructType(TypeName, FieldTypes, Ctx);
+  }
+  case 16: {
+    TypeName += ".8"; // dx.types.CBufRet.f16.8 holds 8 16-bit elements.
+    Type *FieldTypes[8] = {OverloadTy, OverloadTy, OverloadTy, OverloadTy,
+                           OverloadTy, OverloadTy, OverloadTy, OverloadTy};
+    return getOrCreateStructType(TypeName, FieldTypes, Ctx);
+  }
+  default:
llvm_unreachable("invalid Overload type"); + return nullptr; + } +} + static StructType *getHandleType(LLVMContext &Ctx) { return getOrCreateStructType("dx.types.Handle", Type::getInt8PtrTy(Ctx), Ctx); } @@ -213,6 +247,8 @@ return getResRetType(OverloadTy, Ctx); case ParameterKind::DXIL_HANDLE: return getHandleType(Ctx); + case ParameterKind::CBUFFER_RET: + return getCBufRetType(OverloadTy, Ctx); default: break; } @@ -339,5 +375,13 @@ ByteOffset, B.getInt32(Alignment)}); } +CallInst *DXILOpBuilder::createCBufferLoadLegacy(Type *OverloadTy, Value *Hdl, + Value *RegIndex) { + auto Fn = + getOrCreateDXILOpFunction(DXIL::OpCode::CBufferLoadLegacy, OverloadTy, M); + return B.CreateCall(Fn, {B.getInt32((int32_t)DXIL::OpCode::CBufferLoadLegacy), + Hdl, RegIndex}); +} + } // namespace DXIL } // namespace llvm diff --git a/llvm/test/CodeGen/DirectX/cbuf.ll b/llvm/test/CodeGen/DirectX/cbuf.ll deleted file mode 100644 --- a/llvm/test/CodeGen/DirectX/cbuf.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -S -dxil-cbuf-lower < %s | FileCheck %s -target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" -target triple = "dxil-unknown-shadermodel6.7-library" - -; Make sure generate create handle. -; CHECK:%[[HDL:.+]] = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 2, i1 false) - -; Make sure load at offset 0/8/20 for @A.cb.float/double/<2 x i32>.y -; CHECK:call float @dx.op.cbufferLoad.f32(i32 58, %dx.types.Handle %[[HDL]], i32 0, i32 4) -; CHECK:call double @dx.op.cbufferLoad.f64(i32 58, %dx.types.Handle %[[HDL]], i32 8, i32 8) -; CHECK:call i32 @dx.op.cbufferLoad.i32(i32 58, %dx.types.Handle %[[HDL]], i32 20, i32 4) -@A.cb. = external addrspace(4) constant { float, i32, double, <2 x i32> } - -; Function Attrs: noinline nounwind optnone -define noundef float @"?foo@@YAMXZ"() #0 { -entry: - %0 = load float, ptr addrspacecast (ptr addrspace(4) @A.cb. to ptr), align 4 - %conv = fpext float %0 to double - %1 = load double, ptr addrspacecast (ptr addrspace(4) getelementptr inbounds ({ float, i32, double, <2 x i32> }, ptr addrspace(4) @A.cb., i32 0, i32 2) to ptr), align 8 - %2 = load <2 x i32>, ptr addrspacecast (ptr addrspace(4) getelementptr inbounds ({ float, i32, double, <2 x i32> }, ptr addrspace(4) @A.cb., i32 0, i32 3) to ptr), align 8 - %3 = extractelement <2 x i32> %2, i32 1 - %conv1 = sitofp i32 %3 to double - %4 = call double @llvm.fmuladd.f64(double %1, double %conv1, double %conv) - %conv2 = fptrunc double %4 to float - ret float %conv2 -} - -; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn -declare double @llvm.fmuladd.f64(double, double, double) #1 - -attributes #0 = { noinline nounwind } -attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } - -!hlsl.cbufs = !{!1} - -!1 = !{ptr addrspace(4) @A.cb., !"A.cb.ty", i32 0, i32 2, i32 1} diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -dxil-cbuf-lower < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-unknown-shadermodel6.7-library" + +@A.cb. = external local_unnamed_addr constant { float, double, float, half, i16, i64, i32 } + +; Make sure first float is load from cb[0].x. 
+; CHECK:float @fooA0() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 0) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT:ret float %2 +define noundef float @fooA0() local_unnamed_addr { +entry: + %0 = load float, ptr @A.cb., align 8 + ret float %0 +} + +; Make sure double is load from cb[0].zw. +; CHECK:double @fooA1() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 0) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT:ret double %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef double @fooA1() local_unnamed_addr { +entry: + %0 = load double, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 1), align 8 + ret double %0 +} + +; Make sure second float is load from cb[1].x. +; CHECK:float @fooA2() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT:ret float %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef float @fooA2() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 2), align 8 + ret float %0 +} + +; Make sure half is load from low16bit of cb[1].y. +; CHECK:half @fooA3() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.f16.8 %1, 2 +; CHECK-NEXT:ret half %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef half @fooA3() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 3), align 4 + ret half %0 +} + +; Make sure i16 is load from high16bit of cb[1].y. +; CHECK:signext i16 @fooA4() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.i16.8 @dx.op.cbufferLoadLegacy.i16(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.i16.8 %1, 3 +; CHECK-NEXT:ret i16 %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef signext i16 @fooA4() local_unnamed_addr { +entry: + %0 = load i16, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 4), align 2 + ret i16 %0 +} + +; Make sure i64 is load from cb[1].zw. 
+; CHECK:i64 @fooA5() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.i64 @dx.op.cbufferLoadLegacy.i64(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.i64 %1, 1 +; CHECK-NEXT:ret i64 %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef i64 @fooA5() local_unnamed_addr { +entry: + %0 = load i64, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 5), align 8 + ret i64 %0 +} + +; Make sure i32 is load from cb[2].x. +; CHECK:i32 @fooA6() +; CHECK-NEXT:entry +; CHECK-NEXT:%0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT:%1 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %0, i32 2) +; CHECK-NEXT:%2 = extractvalue %dx.types.CBufRet.i32 %1, 0 +; CHECK-NEXT:ret i32 %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef i32 @fooA6() local_unnamed_addr { +entry: + %0 = load i32, ptr getelementptr inbounds ({ float, double, float, half, i16, i64, i32 }, ptr @A.cb., i32 0, i32 6), align 8 + ret i32 %0 +} + +!hlsl.cbufs = !{!0} +!0 = !{ptr @A.cb., !"A.cb.ty", i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll @@ -0,0 +1,150 @@ +; RUN: opt -S -dxil-cbuf-lower < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-unknown-shadermodel6.7-library" + +@B.cb. 
= external local_unnamed_addr constant { double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> } + +; Make sure first double load from C[0].xy +; CHECK:double @fooB0() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 0) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: ret double %2 +define noundef double @fooB0() local_unnamed_addr { +entry: + %0 = load double, ptr @B.cb., align 32 + ret double %0 +} + +; Make sure <3 x float> load from C[1].xyz +; CHECK:<3 x float> @fooB1() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x float> poison, float %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f32 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x float> %3, float %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f32 %1, 2 +; CHECK-NEXT: %7 = insertelement <3 x float> %5, float %6, i64 2 +; CHECK-NEXT: ret <3 x float> %7 +define noundef <3 x float> @fooB1() local_unnamed_addr { +entry: + %0 = load <3 x float>, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 1), align 16 + ret <3 x float> %0 +} + +; Make sure first float load from C[1].w +; CHECK: float @fooB2() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 3 +; CHECK-NEXT: ret float %2 +define noundef float @fooB2() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 2), align 32 + ret float %0 +} + +; Make sure <3 x double> load from C[2].xyzw and C[3].xy +; CHECK: <3 x double> @fooB3() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 2) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x double> %3, double %4, i64 1 +; CHECK-NEXT: %6 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 3) +; CHECK-NEXT: %7 = extractvalue %dx.types.CBufRet.f64 %6, 0 +; CHECK-NEXT: %8 = insertelement <3 x double> %5, double %7, i64 2 +; CHECK-NEXT: ret <3 x double> %8 +define noundef <3 x double> @fooB3() local_unnamed_addr { +entry: + %0 = load <3 x double>, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 3), align 32 + ret <3 x double> %0 +} + +; Make sure half load from low16bit of C[3].z +;CHECK: half @fooB4() +; CHECK-NEXT: entry: +; 
CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 3) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: ret half %2 +define noundef half @fooB4() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 4), align 32 + ret half %0 +} + +; Make sure <2 x double> load from C[4].xyzw +; CHECK: <2 x double> @fooB5() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 4) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <2 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <2 x double> %3, double %4, i64 1 +; CHECK-NEXT: ret <2 x double> %5 +define noundef <2 x double> @fooB5() local_unnamed_addr { +entry: + %0 = load <2 x double>, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 5), align 16 + ret <2 x double> %0 +} + +; Make sure second float load from C[5].x +; CHECK:float @fooB6() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 5) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: ret float %2 +define noundef float @fooB6() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 6), align 32 + ret float %0 +} + +; Make sure first <3 x half> load from C[5].y and low 16bit of C[5].z +;CHECK: <3 x half> @fooB7() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 5) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 2 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 3 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooB7() local_unnamed_addr { +entry: + %0 = load <3 x half>, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 7), align 8 + ret <3 x half> %0 +} + +; Make sure second <3 x half> load from high 16bit of C[5].z and C[5].w +;CHECK: <3 x half> @fooB8() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 5) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 5 +; CHECK-NEXT: %3 = 
insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 6 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 7 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooB8() local_unnamed_addr { +entry: + %0 = load <3 x half>, ptr getelementptr inbounds ({ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }, ptr @B.cb., i32 0, i32 8), align 16 + ret <3 x half> %0 +} + +!hlsl.cbufs = !{!0} + +!0 = !{ptr @B.cb., !"B.cb.ty", i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll @@ -0,0 +1,203 @@ +; RUN: opt -S -dxil-cbuf-lower < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-unknown-shadermodel6.7-library" + +@B.cb. = external local_unnamed_addr constant { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> } +@B.cb..1 = external local_unnamed_addr constant { [3 x <3 x double>], <3 x half> } + +; Make sure indexing [2 x double] from C[0].xy +; CHECK: double @fooB0(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx = getelementptr inbounds [2 x double], ptr @B.cb., i32 0, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %i) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: ret double %2 +define noundef double @fooB0(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds [2 x double], ptr @B.cb., i32 0, i32 %i + %0 = load double, ptr %arrayidx, align 8 + ret double %0 +} + +; Make sure indexing [3 x <3 x float>] from C[2].xyz +; CHECK: <3 x float> @fooB1(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx.offs = add nsw i32 2, %i +; CHECK-NEXT: %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 1, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x float> poison, float %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f32 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x float> %3, float %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f32 %1, 2 +; CHECK-NEXT: %7 = insertelement <3 x float> %5, float %6, i64 2 +; CHECK-NEXT: ret <3 x float> %7 +define noundef <3 x float> @fooB1(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 1, i32 %i + %0 = load <3 x float>, ptr %arrayidx, align 16 + ret <3 x float> %0 +} + +; Make sure load first float from C[4].z +; CHECK: float @fooB2() +; CHECK-NEXT: entry: +; 
CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 4) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 3 +; CHECK-NEXT: ret float %2 +define noundef float @fooB2() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 2), align 16 + ret float %0 +} + +; Make sure indexing [3 x double] from C[5].xy. +; CHECK: double @fooB3(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx.offs = add nsw i32 5, %i +; CHECK-NEXT: %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 3, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: ret double %2 +define noundef double @fooB3(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 3, i32 %i + %0 = load double, ptr %arrayidx, align 8 + ret double %0 +} + +; Make sure load half from low16bit of C[7].z +; CHECK:half @fooB4() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 7) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: ret half %2 +define noundef half @fooB4() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 4), align 16 + ret half %0 +} + +; Make sure indexing [1 x <2 x double>] from C[8].xy +; CHECK: <2 x double> @fooB5(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx.offs = add nsw i32 8, %i +; CHECK-NEXT: %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 5, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <2 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <2 x double> %3, double %4, i64 1 +; CHECK-NEXT: ret <2 x double> %5 +define noundef <2 x double> @fooB5(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 5, i32 %i + %0 = load <2 x double>, ptr %arrayidx, align 16 + ret <2 x double> %0 +} + +; Make sure second fload load 
from C[9].x +; CHECK: float @fooB6() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 9) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: ret float %2 +define noundef float @fooB6() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 6), align 16 + ret float %0 +} + +; Make sure indexing [2 x <3 x half>] from C[10].x and low 16bit of C[10].y +; CHECK: <3 x half> @fooB7(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx.offs = add nsw i32 10, %i +; CHECK-NEXT: %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 7, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 2 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooB7(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds { [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 7, i32 %i + %0 = load <3 x half>, ptr %arrayidx, align 8 + ret <3 x half> %0 +} + +; Make sure load half3 from high 16bit of C[11].y and C[11].z +; CHECK: <3 x half> @fooB8() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 11) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 3 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 5 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooB8() local_unnamed_addr { +entry: + %0 = load <3 x half>, ptr getelementptr inbounds ({ [2 x double], [3 x <3 x float>], float, [3 x double], half, [1 x <2 x double>], float, [2 x <3 x half>], <3 x half> }, ptr @B.cb., i32 0, i32 8), align 8 + ret <3 x half> %0 +} + +; Make sure indexing [3 x <3 x double>] from C1[0]. +; CHECK: <3 x double> @fooB9(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 1, i32 -1, i1 false) +; Make sure each row is 2x16dwords. 
+; CHECK-NEXT: %arrayidx.idx = mul nsw i32 %i, 2 +; CHECK-NEXT: %arrayidx = getelementptr inbounds [3 x <3 x double>], ptr @B.cb..1, i32 0, i32 %i +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %arrayidx.idx) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x double> %3, double %4, i64 1 +; Next row for z of <3 x double> +; CHECK-NEXT: %6 = add i32 %arrayidx.idx, 1 +; CHECK-NEXT: %7 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %6) +; CHECK-NEXT: %8 = extractvalue %dx.types.CBufRet.f64 %7, 0 +; CHECK-NEXT: %9 = insertelement <3 x double> %5, double %8, i64 2 +; CHECK-NEXT: ret <3 x double> %9 +define noundef <3 x double> @fooB9(i32 noundef %i) local_unnamed_addr { +entry: + %arrayidx = getelementptr inbounds [3 x <3 x double>], ptr @B.cb..1, i32 0, i32 %i + %loadVec3 = load <3 x double>, ptr %arrayidx, align 32 + ret <3 x double> %loadVec3 +} + +; Make sure load half3 from C1[5].z and low 16bit of C1[5].w +; CHECK: <3 x half> @fooB10() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 1, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 5) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 5 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 6 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooB10() local_unnamed_addr { +entry: + %loadVec3 = load <3 x half>, ptr getelementptr inbounds ({ [3 x <3 x double>], <3 x half> }, ptr @B.cb..1, i32 0, i32 1), align 32 + ret <3 x half> %loadVec3 +} + +!hlsl.cbufs = !{!0, !1} + +!0 = !{ptr @B.cb., !"B.cb.ty", i32 0, i32 -1, i32 0} +!1 = !{ptr @B.cb..1, !"B.cb.ty", i32 1, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll @@ -0,0 +1,554 @@ +; RUN: opt -S -dxil-cbuf-lower < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-unknown-shadermodel6.7-library" + +; struct A { +; float A0; // Offset 0 +; double A1; // Offset 0, ch zw +; float A2; // Offest 1 +; half A3; // Offset 1, ch low16 y +; int16_t A4; // Offset 1, ch high16 y +; int64_t A5; // Offset 1, ch zw +; int A6; // Offset 2, +; }; // Next offset at 2.y +; +; struct B { +; double B0; // Offset 0 +; float3 B1; // Offset 1 +; float B2; // Offset 1, ch 3 +; double3 B3; // Offset 2 +; half B4; // Offset 3.z low16 +; double2 B5; // Offset 4 +; float B6; // Offset 5 +; half3 B7; // Offset 5, ch y and low16 of z +; half3 B8; // Offset 5, ch high16 z and w +; }; // Next offset 6 +; +; struct C { +; A C0; // Offest 0, size 3 +; float C1[1]; // Offset 3, +; B C2[2]; // Offset 4, size 6 * 2 +; half C3; // Offset 16 +; }; // Next offset 16, high 16 of x +; +; cbuffer D { +; int D0; // Offset 0 +; B D1; // Offest 1, Size 6 +; half D2; // Offset 7 +; C D3; // 
Offset 8, Size 16.high 16x +; double D4; // Offset 24, ch zw +; } + +%struct.B = type <{ double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> }> +%struct.C = type <{ %struct.A, [1 x float], [2 x %struct.B], half }> +%struct.A = type <{ float, double, float, half, i16, i64, i32 }> + +@D.cb. = external local_unnamed_addr constant { i32, %struct.B, half, %struct.C, double } + +; Make sure D0 load from C[0].x +; CHECK: i32 @fooD0() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %0, i32 0) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.i32 %1, 0 +; CHECK-NEXT: ret i32 %2 +define noundef i32 @fooD0() local_unnamed_addr { +entry: + %0 = load i32, ptr @D.cb., align 8 + ret i32 %0 +} + +; Make sure D2 load from low 16bit of C[7].x +; CHECK: half @fooD2() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 7) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 0 +; CHECK-NEXT: ret half %2 +define noundef half @fooD2() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 2), align 2 + ret half %0 +} + +; Mae sure D4 load from C[24].zw +; CHECK: double @fooD4() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 24) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: ret double %2 +define noundef double @fooD4() local_unnamed_addr { +entry: + %0 = load double, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 4), align 8 + ret double %0 +} + +; Make sure D1.B0 load from C[1].xy +; CHECK: double @fooD1_B0() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 1) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: ret double %2 +define noundef double @fooD1_B0() local_unnamed_addr { +entry: + %0 = load double, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1), align 4 + ret double %0 +} + +; Make sure D1.B1 load from C[2].xyz +; CHECK: <3 x float> @fooD1_B1() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 2) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x float> poison, float %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f32 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x float> %3, float %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f32 %1, 2 +; CHECK-NEXT: %7 = insertelement <3 x float> %5, float %6, i64 2 +; CHECK-NEXT: ret <3 x float> %7 +define noundef <3 x float> @fooD1_B1() local_unnamed_addr { +entry: + %0 = load <3 x float>, ptr getelementptr inbounds 
({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 1), align 4 + ret <3 x float> %0 +} + +; Make sure D1.B2 load from C[2].w +; CHECK: float @fooD1_B2() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 2) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 3 +; CHECK-NEXT: ret float %2 +define noundef float @fooD1_B2() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 2), align 4 + ret float %0 +} + +; Make sure D1.B3 load from C[3].xyzw and C[4].xy +; CHECK: <3 x double> @fooD1_B3() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 3) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <3 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <3 x double> %3, double %4, i64 1 +; CHECK-NEXT: %6 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 4) +; CHECK-NEXT: %7 = extractvalue %dx.types.CBufRet.f64 %6, 0 +; CHECK-NEXT: %8 = insertelement <3 x double> %5, double %7, i64 2 +; CHECK-NEXT: ret <3 x double> %8 +define noundef <3 x double> @fooD1_B3() local_unnamed_addr { +entry: + %0 = load <3 x double>, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 3), align 8 + ret <3 x double> %0 +} + +; Make sure D1.B4 load from low 16bit of C[4].z +; CHECK: half @fooD1_B4() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 4) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: ret half %2 +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define noundef half @fooD1_B4() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 4), align 8 + ret half %0 +} + +; Make sure D1.B5 load from C[5].xyzw +; CHECK: <2 x double> @fooD1_B5() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 5) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0 +; CHECK-NEXT: %3 = insertelement <2 x double> poison, double %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: %5 = insertelement <2 x double> %3, double %4, i64 1 +; CHECK-NEXT: ret <2 x double> %5 +define noundef <2 x double> @fooD1_B5() local_unnamed_addr { +entry: + %0 = load <2 x double>, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 5), align 2 + ret <2 x double> %0 +} + +; Make sure D1.B6 load from C[6].x +; CHECK: float @fooD1_B6() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; 
CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 6) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: ret float %2 +define noundef float @fooD1_B6() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 6), align 2 + ret float %0 +} + +; Make sure D1.B7 load from low 16bit of C[6].z and C[6].y +; CHECK: <3 x half> @fooD1_B7() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 6) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 2 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 3 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 4 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooD1_B7() local_unnamed_addr { +entry: + %0 = load <3 x half>, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 7), align 2 + ret <3 x half> %0 +} + +; Make sure D1.B8 load from high 16bit of C[6].z and C[6].w +; CHECK: <3 x half> @fooD1_B8() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 6) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 5 +; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0 +; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 6 +; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1 +; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 7 +; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2 +; CHECK-NEXT: ret <3 x half> %7 +define noundef <3 x half> @fooD1_B8() local_unnamed_addr { +entry: + %0 = load <3 x half>, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 1, i32 8), align 2 + ret <3 x half> %0 +} + +; Make sure D3.C0.A0 load from C[8].x +; CHECK: float @fooD3_C0_A0() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 8) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: ret float %2 +define noundef float @fooD3_C0_A0() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3), align 8 + ret float %0 +} + +; Make sure D3.C0.A1 load from C[8].zw +; CHECK: double @fooD3_C0_A1() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 8) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 1 +; CHECK-NEXT: ret double %2 +define noundef double @fooD3_C0_A1() local_unnamed_addr { +entry: + %0 = load double, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., 
i32 0, i32 3, i32 0, i32 1), align 4 + ret double %0 +} + +; Make sure D3.C0.A2 load from C[9].x +; CHECK: float @fooD3_C0_A2() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 9) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0 +; CHECK-NEXT: ret float %2 +define noundef float @fooD3_C0_A2() local_unnamed_addr { +entry: + %0 = load float, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 0, i32 2), align 4 + ret float %0 +} + +; Make sure D3.C0.A3 load from low 16bit of C[9].y +; CHECK: half @fooD3_C0_A3() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 9) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 2 +; CHECK-NEXT: ret half %2 +define noundef half @fooD3_C0_A3() local_unnamed_addr { +entry: + %0 = load half, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 0, i32 3), align 8 + ret half %0 +} + +; Make sure D3.C0.A4 load from high 16bit of C[9].y +; CHECK: i16 @fooD4_C0_A4() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.i16.8 @dx.op.cbufferLoadLegacy.i16(i32 59, %dx.types.Handle %0, i32 9) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.i16.8 %1, 3 +; CHECK-NEXT: ret i16 %2 +define noundef signext i16 @fooD4_C0_A4() local_unnamed_addr { +entry: + %0 = load i16, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 0, i32 4), align 2 + ret i16 %0 +} + +; Make sure D3.C0.A5 load from C[9].zw +; CHECK: i64 @fooD3_C0_A5() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.i64 @dx.op.cbufferLoadLegacy.i64(i32 59, %dx.types.Handle %0, i32 9) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.i64 %1, 1 +; CHECK-NEXT: ret i64 %2 +define noundef i64 @fooD3_C0_A5() local_unnamed_addr { +entry: + %0 = load i64, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 0, i32 5), align 4 + ret i64 %0 +} + +; Make sure D3.C0.A6 load from C[10].x +; CHECK: i32 @fooD3_C0_A6() +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %1 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %0, i32 10) +; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.i32 %1, 0 +; CHECK-NEXT: ret i32 %2 +define noundef i32 @fooD3_C0_A6() local_unnamed_addr { +entry: + %0 = load i32, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 0, i32 6), align 4 + ret i32 %0 +} + +; Make sure indexing D3.C1 from C[11].x +; CHECK: float @fooD3_C1(i32 noundef %i) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false) +; CHECK-NEXT: %arrayidx.offs = add nsw i32 11, %i +; CHECK-NEXT: %arrayidx = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, 
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0
+; CHECK-NEXT: ret float %2
+define noundef float @fooD3_C1(i32 noundef %i) local_unnamed_addr {
+entry:
+ %arrayidx = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 1, i32 %i
+ %0 = load float, ptr %arrayidx, align 4
+ ret float %0
+}
+
+; Make sure load D3.C3 from low 16bit of C[24].x
+; CHECK: half @fooD3_C3()
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 24)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 0
+; CHECK-NEXT: ret half %2
+define noundef half @fooD3_C3() local_unnamed_addr {
+entry:
+ %0 = load half, ptr getelementptr inbounds ({ i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 3), align 8
+ ret half %0
+}
+
+; Make sure indexing D3.C2.B0 from C[12].xy
+; CHECK: double @fooD3_C2_B0(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %arrayidx.idx = mul nsw i32 %i, 6
+; Make sure base is C[12].
+; CHECK-NEXT: %arrayidx.offs = add nsw i32 12, %arrayidx.idx
+; CHECK-NEXT: %arrayidx = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %arrayidx.offs)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0
+; CHECK-NEXT: ret double %2
+define noundef double @fooD3_C2_B0(i32 noundef %i) local_unnamed_addr {
+entry:
+ %arrayidx = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i
+ %0 = load double, ptr %arrayidx, align 2
+ ret double %0
+}
+
+; Make sure indexing D3.C2.B1 from C[13].xyz
+; CHECK: <3 x float> @fooD3_C2_B1(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B1.idx = mul nsw i32 %i, 6
+; Make sure base is C[12+1]
+; CHECK-NEXT: %B1.offs = add nsw i32 12, %B1.idx
+; CHECK-NEXT: %B1.offs1 = add nsw i32 %B1.offs, 1
+; CHECK-NEXT: %B1 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 1
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 %B1.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0
+; CHECK-NEXT: %3 = insertelement <3 x float> poison, float %2, i64 0
+; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f32 %1, 1
+; CHECK-NEXT: %5 = insertelement <3 x float> %3, float %4, i64 1
+; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f32 %1, 2
+; CHECK-NEXT: %7 = insertelement <3 x float> %5, float %6, i64 2
+; CHECK-NEXT: ret <3 x float> %7
+define noundef <3 x float> @fooD3_C2_B1(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B1 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 1
+ %0 = load <3 x float>, ptr %B1, align 2
+ ret <3 x float> %0
+}
+
+; Make sure indexing D3.C2.B2 from C[13].w
+; CHECK: float @fooD3_C2_B2(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B2.idx = mul nsw i32 %i, 6
+; Make sure base is C[12+1]
+; CHECK-NEXT: %B2.offs = add nsw i32 12, %B2.idx
+; CHECK-NEXT: %B2.offs1 = add nsw i32 %B2.offs, 1
+; CHECK-NEXT: %B2 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 2
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 %B2.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 3
+; CHECK-NEXT: ret float %2
+define noundef float @fooD3_C2_B2(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B2 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 2
+ %0 = load float, ptr %B2, align 2
+ ret float %0
+}
+
+; Make sure indexing D3.C2.B3 from C[14].xyzw + C[15].xy
+; CHECK: <3 x double> @fooD3_C2_B3(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B3.idx = mul nsw i32 %i, 6
+; Make sure base is C[12 + 2]
+; CHECK-NEXT: %B3.offs = add nsw i32 12, %B3.idx
+; CHECK-NEXT: %B3.offs1 = add nsw i32 %B3.offs, 2
+; CHECK-NEXT: %B3 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 3
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %B3.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0
+; CHECK-NEXT: %3 = insertelement <3 x double> poison, double %2, i64 0
+; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1
+; CHECK-NEXT: %5 = insertelement <3 x double> %3, double %4, i64 1
+; Access C[15].xy.
+; CHECK-NEXT: %6 = add i32 %B3.offs1, 1
+; CHECK-NEXT: %7 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %6)
+; CHECK-NEXT: %8 = extractvalue %dx.types.CBufRet.f64 %7, 0
+; CHECK-NEXT: %9 = insertelement <3 x double> %5, double %8, i64 2
+; CHECK-NEXT: ret <3 x double> %9
+define noundef <3 x double> @fooD3_C2_B3(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B3 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 3
+ %0 = load <3 x double>, ptr %B3, align 2
+ ret <3 x double> %0
+}
+
+; Make sure indexing D3.C2.B4 from low 16bit of C[15].z
+; CHECK: half @fooD3_C2_B4(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B4.idx = mul nsw i32 %i, 6
+; Make sure base is C[12+3]
+; CHECK-NEXT: %B4.offs = add nsw i32 12, %B4.idx
+; CHECK-NEXT: %B4.offs1 = add nsw i32 %B4.offs, 3
+; CHECK-NEXT: %B4 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 4
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 %B4.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 4
+; CHECK-NEXT: ret half %2
+define noundef half @fooD3_C2_B4(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B4 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 4
+ %0 = load half, ptr %B4, align 2
+ ret half %0
+}
+
+; Make sure D3.C2.B5 indexing from C[16].xyzw
+; CHECK: <2 x double> @fooD3_C2_B5(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B5.idx = mul nsw i32 %i, 6
+; Make sure base is C[12+4]
+; CHECK-NEXT: %B5.offs = add nsw i32 12, %B5.idx
+; CHECK-NEXT: %B5.offs1 = add nsw i32 %B5.offs, 4
+; CHECK-NEXT: %B5 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 5
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %0, i32 %B5.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f64 %1, 0
+; CHECK-NEXT: %3 = insertelement <2 x double> poison, double %2, i64 0
+; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f64 %1, 1
+; CHECK-NEXT: %5 = insertelement <2 x double> %3, double %4, i64 1
+; CHECK-NEXT: ret <2 x double> %5
+define noundef <2 x double> @fooD3_C2_B5(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B5 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 5
+ %0 = load <2 x double>, ptr %B5, align 2
+ ret <2 x double> %0
+}
+
+; Make sure D3.C2.B6 indexing from C[17].x
+; CHECK: float @fooD3_C2_B6(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B6.idx = mul nsw i32 %i, 6
+; Make sure base is C[12 + 5]
+; CHECK-NEXT: %B6.offs = add nsw i32 12, %B6.idx
+; CHECK-NEXT: %B6.offs1 = add nsw i32 %B6.offs, 5
+; CHECK-NEXT: %B6 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 6
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %0, i32 %B6.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f32 %1, 0
+; CHECK-NEXT: ret float %2
+define noundef float @fooD3_C2_B6(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B6 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 6
+ %0 = load float, ptr %B6, align 2
+ ret float %0
+}
+
+; Make sure D3.C2.B7 indexing from C[17].y and low 16bit of C[17].z
+; CHECK: <3 x half> @fooD3_C2_B7(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; Make sure struct size is 6 x 4dwords.
+; CHECK-NEXT: %B7.idx = mul nsw i32 %i, 6
+; Make sure base is C[12+5]
+; CHECK-NEXT: %B7.offs = add nsw i32 12, %B7.idx
+; CHECK-NEXT: %B7.offs1 = add nsw i32 %B7.offs, 5
+; CHECK-NEXT: %B7 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 7
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 %B7.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 2
+; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0
+; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 3
+; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1
+; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 4
+; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2
+; CHECK-NEXT: ret <3 x half> %7
+define noundef <3 x half> @fooD3_C2_B7(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B7 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 7
+ %0 = load <3 x half>, ptr %B7, align 2
+ ret <3 x half> %0
+}
+
+; Make sure D3.C2.B8 indexing from high 16bit of C[17].z and C[17].w
+; CHECK: <3 x half> @fooD3_C2_B8(i32 noundef %i)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 -1, i1 false)
+; CHECK-NEXT: %B8.idx = mul nsw i32 %i, 6
+; CHECK-NEXT: %B8.offs = add nsw i32 12, %B8.idx
+; CHECK-NEXT: %B8.offs1 = add nsw i32 %B8.offs, 5
+; CHECK-NEXT: %B8 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 8
+; CHECK-NEXT: %1 = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %0, i32 %B8.offs1)
+; CHECK-NEXT: %2 = extractvalue %dx.types.CBufRet.f16.8 %1, 5
+; CHECK-NEXT: %3 = insertelement <3 x half> poison, half %2, i64 0
+; CHECK-NEXT: %4 = extractvalue %dx.types.CBufRet.f16.8 %1, 6
+; CHECK-NEXT: %5 = insertelement <3 x half> %3, half %4, i64 1
+; CHECK-NEXT: %6 = extractvalue %dx.types.CBufRet.f16.8 %1, 7
+; CHECK-NEXT: %7 = insertelement <3 x half> %5, half %6, i64 2
+; CHECK-NEXT: ret <3 x half> %7
+define noundef <3 x half> @fooD3_C2_B8(i32 noundef %i) local_unnamed_addr {
+entry:
+ %B8 = getelementptr inbounds { i32, %struct.B, half, %struct.C, double }, ptr @D.cb., i32 0, i32 3, i32 2, i32 %i, i32 8
+ %0 = load <3 x half>, ptr %B8, align 2
+ ret <3 x half> %0
+}
+
+
+!hlsl.cbufs = !{!0}
+!0 = !{ptr @D.cb., !"D.cb.ty", i32 0, i32 -1, i32 0}
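
The tests above all exercise the same legacy-layout arithmetic: each constant register C[n] is a 16-byte (4-dword) row, cbufferLoadLegacy takes the row index, and the element's channel selects the component within that row. A minimal standalone sketch of that mapping follows; the helper name rowAndChannel and the example offset are illustrative assumptions, not part of the patch or of the pass's actual API.

// Hypothetical sketch: map a byte offset in a legacy cbuffer layout to the
// (row, channel) pair used by cbufferLoadLegacy/extractvalue, assuming the
// component does not straddle a 16-byte row.
#include <cassert>
#include <cstdio>
#include <utility>

static std::pair<unsigned, unsigned> rowAndChannel(unsigned ByteOffset,
                                                   unsigned ElemBytes) {
  const unsigned RowBytes = 16; // one constant register C[n]
  assert(ByteOffset % RowBytes + ElemBytes <= RowBytes &&
         "component straddles a row");
  return {ByteOffset / RowBytes, (ByteOffset % RowBytes) / ElemBytes};
}

int main() {
  // e.g. a 4-byte float at byte offset 200 is read as C[12], channel 2 (.z).
  auto [Row, Ch] = rowAndChannel(200, /*ElemBytes=*/4);
  std::printf("C[%u] channel %u\n", Row, Ch);
  return 0;
}

The per-struct checks like "%Bn.idx = mul nsw i32 %i, 6" follow the same idea one level up: an array element that is 6 rows long contributes i * 6 rows, and the field's own row offset is then added on top of the array base.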