diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 496fe094..d61d8069 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,9 +6,9 @@ name: CI on: # Triggers the workflow on push or pull request events but only for the main branch push: - branches: [ master, ci-test ] + branches: [ master, chi-coupledl2 ] pull_request: - branches: [ master, ci-test ] + branches: [ master, chi-coupledl2 ] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: @@ -47,8 +47,8 @@ jobs: - name: Compile run: make compile - - - name: Unit test + + - name: Unit test for TileLink version run: | git clone https://github.com/OpenXiangShan/tl-test -b coupledL2-huancun make test-top-l2l3l2 @@ -57,3 +57,7 @@ jobs: cmake .. -DDUT_DIR=../../build -DCHISELDB=1 make ./tlc_test -s $RANDOM + + - name: Compile CHI QuadCore + run: | + make test-top-chi-quadcore-2ul diff --git a/Makefile b/Makefile index 177701be..88c61eff 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,30 @@ test-top-l2l3l2: test-top-fullsys: mill -i CoupledL2.test.runMain coupledL2.TestTop_fullSys -td build +test-top-chi-dualcore-0ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_DualCore_0UL -td build + +test-top-chi-dualcore-2ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_DualCore_2UL -td build + +test-top-chi-quadcore-0ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_QuadCore_0UL -td build + +test-top-chi-quadcore-2ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_QuadCore_2UL -td build + +test-top-chi-octacore-0ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_OctaCore_0UL -td build + +test-top-chi-octacore-2ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_OctaCore_2UL -td build + +test-top-chi-hexacore-0ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_HexaCore_0UL -td build + +test-top-chi-hexacore-2ul: + mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_HexaCore_2UL -td build + clean: rm -rf ./build diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala index 48d5318c..59dafa8b 100644 --- a/src/main/scala/coupledL2/Common.scala +++ b/src/main/scala/coupledL2/Common.scala @@ -22,6 +22,7 @@ import chisel3.util._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink.TLPermissions._ import utility.MemReqSource +import tl2chi.{HasCHIMsgParameters, HasCHIChannelBits, CHIREQ, MemAttr, OrderEncodings} abstract class L2Module(implicit val p: Parameters) extends Module with HasCoupledL2Parameters abstract class L2Bundle(implicit val p: Parameters) extends Bundle with HasCoupledL2Parameters @@ -32,7 +33,7 @@ class ReplacerInfo(implicit p: Parameters) extends L2Bundle { val reqSource = UInt(MemReqSource.reqSourceBits.W) } -trait HasChannelBits { this: Bundle => +trait HasTLChannelBits { this: Bundle => val channel = UInt(3.W) def fromA = channel(0).asBool def fromB = channel(1).asBool @@ -52,7 +53,10 @@ class MergeTaskBundle(implicit p: Parameters) extends L2Bundle { // We generate a Task for every TL request // this is the info that flows in Mainpipe -class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits { +class TaskBundle(implicit p: Parameters) extends L2Bundle + with HasTLChannelBits + with HasCHIMsgParameters + with HasCHIChannelBits { val set = UInt(setBits.W) val tag = UInt(tagBits.W) val off = UInt(offsetBits.W) @@ -104,9 +108,47 @@ class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits { // for 
merged MSHR tasks(Acquire & late Prefetch) val mergeA = Bool() val aMergeTask = new MergeTaskBundle() + + // Used for get data from ReleaseBuf when snoop hit with same PA + val snpHitRelease = Bool() + val snpHitReleaseWithData = Bool() + val snpHitReleaseIdx = UInt(mshrBits.W) + // CHI + val tgtID = chiOpt.map(_ => UInt(TGTID_WIDTH.W)) + val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W)) + val txnID = chiOpt.map(_ => UInt(TXNID_WIDTH.W)) + val homeNID = chiOpt.map(_ => UInt(SRCID_WIDTH.W)) + val dbID = chiOpt.map(_ => UInt(DBID_WIDTH.W)) + val fwdNID = chiOpt.map(_ => UInt(FWDNID_WIDTH.W)) + val fwdTxnID = chiOpt.map(_ => UInt(FWDTXNID_WIDTH.W)) + val chiOpcode = chiOpt.map(_ => UInt(OPCODE_WIDTH.W)) + val resp = chiOpt.map(_ => UInt(RESP_WIDTH.W)) + val fwdState = chiOpt.map(_ => UInt(FWDSTATE_WIDTH.W)) + val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W)) + val retToSrc = chiOpt.map(_ => Bool()) // only used in snoop + val expCompAck = chiOpt.map(_ => Bool()) + val allowRetry = chiOpt.map(_ => Bool()) + val memAttr = chiOpt.map(_ => new MemAttr) + + def toCHIREQBundle(): CHIREQ = { + val req = WireInit(0.U.asTypeOf(new CHIREQ())) + req.tgtID := tgtID.getOrElse(0.U) + req.srcID := srcID.getOrElse(0.U) + req.txnID := txnID.getOrElse(0.U) + req.opcode := chiOpcode.getOrElse(0.U) + req.addr := Cat(tag, set, 0.U(offsetBits.W)) + req.allowRetry := allowRetry.getOrElse(true.B) //TODO: consider retry + req.pCrdType := pCrdType.getOrElse(0.U) + req.expCompAck := expCompAck.getOrElse(false.B) + req.memAttr := memAttr.getOrElse(MemAttr()) + req.snpAttr := true.B + req.order := OrderEncodings.None + req + } } -class PipeStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits +class PipeStatus(implicit p: Parameters) extends L2Bundle + with HasTLChannelBits class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle { val tags = Vec(4, UInt(tagBits.W)) @@ -123,34 +165,6 @@ class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle { def g_set = sets(3) } -// MSHR exposes signals to MSHRCtl -class MSHRStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits { - val set = UInt(setBits.W) - val reqTag = UInt(tagBits.W) - val metaTag = UInt(tagBits.W) - val needsRepl = Bool() - val w_c_resp = Bool() - val w_d_resp = Bool() - val will_free = Bool() - - // val way = UInt(wayBits.W) -// val off = UInt(offsetBits.W) -// val opcode = UInt(3.W) -// val param = UInt(3.W) -// val size = UInt(msgSizeBits.W) -// val source = UInt(sourceIdBits.W) -// val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W)) -// val aliasTask = aliasBitsOpt.map(_ => Bool()) -// val needProbeAckData = Bool() // only for B reqs -// val fromL2pft = prefetchOpt.map(_ => Bool()) -// val needHint = prefetchOpt.map(_ => Bool()) - - // for TopDown usage - val reqSource = UInt(MemReqSource.reqSourceBits.W) - val is_miss = Bool() - val is_prefetch = Bool() -} - // MSHR Task that MainPipe sends to MSHRCtl class MSHRRequest(implicit p: Parameters) extends L2Bundle { val dirResult = new DirResult() @@ -159,11 +173,12 @@ class MSHRRequest(implicit p: Parameters) extends L2Bundle { } // MSHR info to ReqBuf and SinkB -class MSHRInfo(implicit p: Parameters) extends L2Bundle { +class MSHRInfo(implicit p: Parameters) extends L2Bundle with HasTLChannelBits { val set = UInt(setBits.W) val way = UInt(wayBits.W) val reqTag = UInt(tagBits.W) val willFree = Bool() + val aliasTask = aliasBitsOpt.map(_ => Bool()) // to block Acquire for to-be-replaced data until Release done (indicated by ReleaseAck received) val 
needRelease = Bool() @@ -172,28 +187,42 @@ class MSHRInfo(implicit p: Parameters) extends L2Bundle { val blockRefill = Bool() val metaTag = UInt(tagBits.W) + val metaState = UInt(stateBits.W) val dirHit = Bool() - // decide whether can nest B (req same-addr) - val nestB = Bool() - // to drop duplicate prefetch reqs val isAcqOrPrefetch = Bool() val isPrefetch = Bool() // whether the mshr_task already in mainpipe - val s_refill = Bool() val param = UInt(3.W) val mergeA = Bool() // whether the mshr already merge an acquire(avoid alias merge) + + val w_grantfirst = Bool() + val s_refill = Bool() val w_releaseack = Bool() + val w_replResp = Bool() + val w_rprobeacklast = Bool() + + val replaceData = Bool() // If there is a replace, WriteBackFull or Evict } -class RespInfoBundle(implicit p: Parameters) extends L2Bundle { +class RespInfoBundle(implicit p: Parameters) extends L2Bundle + with HasCHIMsgParameters +{ val opcode = UInt(3.W) val param = UInt(3.W) val last = Bool() // last beat val dirty = Bool() // only used for sinkD resps val isHit = Bool() // only used for sinkD resps + //CHI + val chiOpcode = chiOpt.map(_ => UInt(OPCODE_WIDTH.W)) + val txnID = chiOpt.map(_ => UInt(TXNID_WIDTH.W)) + val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W)) + val homeNID = chiOpt.map(_ => UInt(SRCID_WIDTH.W)) + val dbID = chiOpt.map(_ => UInt(DBID_WIDTH.W)) + val resp = chiOpt.map(_ => UInt(RESP_WIDTH.W)) + val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W)) } class RespBundle(implicit p: Parameters) extends L2Bundle { @@ -227,6 +256,12 @@ class FSMState(implicit p: Parameters) extends L2Bundle { val w_grant = Bool() val w_releaseack = Bool() val w_replResp = Bool() + + // CHI + val s_compack = chiOpt.map(_ => Bool()) + val s_cbwrdata = chiOpt.map(_ => Bool()) + val s_reissue = chiOpt.map(_ => Bool()) + val s_dct = chiOpt.map(_ => Bool()) } class SourceAReq(implicit p: Parameters) extends L2Bundle { @@ -260,7 +295,13 @@ class BlockInfo(implicit p: Parameters) extends L2Bundle { class NestedWriteback(implicit p: Parameters) extends L2Bundle { val set = UInt(setBits.W) val tag = UInt(tagBits.W) + // Nested ReleaseData sets block dirty val c_set_dirty = Bool() + // Nested Snoop invalidates block + val b_inv_dirty = Bool() + + val b_toB = chiOpt.map(_ => Bool()) + val b_toN = chiOpt.map(_ => Bool()) } class PrefetchRecv extends Bundle { diff --git a/src/main/scala/coupledL2/Consts.scala b/src/main/scala/coupledL2/Consts.scala index be3433c4..52ec8168 100644 --- a/src/main/scala/coupledL2/Consts.scala +++ b/src/main/scala/coupledL2/Consts.scala @@ -30,6 +30,11 @@ object MetaData { def TRUNK: UInt = 2.U(stateBits.W) // unique inner master cache is trunk def TIP: UInt = 3.U(stateBits.W) // we are trunk, inner masters are branch + def needB(opcode: UInt, param: UInt): Bool = { + opcode === TLMessages.Get || + opcode === TLMessages.AcquireBlock && param === TLPermissions.NtoB || + opcode === TLMessages.Hint && param === TLHints.PREFETCH_READ + } // Does a request need trunk to be handled? 
def needT(opcode: UInt, param: UInt): Bool = { !opcode(2) || @@ -64,4 +69,5 @@ object MetaData { Seq(INVALID, INVALID, BRANCH) ) } + def isValid(state: UInt): Bool = state > INVALID } \ No newline at end of file diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala index 3408f9ff..787f1246 100644 --- a/src/main/scala/coupledL2/CoupledL2.scala +++ b/src/main/scala/coupledL2/CoupledL2.scala @@ -27,14 +27,16 @@ import freechips.rocketchip.tile.MaxHartIdBits import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.util._ -import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.cde.config.{Parameters, Field} import scala.math.max import coupledL2.prefetch._ import coupledL2.utils.XSPerfAccumulate -import huancun.{TPmetaReq, TPmetaResp} +import huancun.{TPmetaReq, TPmetaResp, BankBitsKey} trait HasCoupledL2Parameters { val p: Parameters + // val tl2tlParams: HasTLL2Parameters = p(L2ParamKey) + val enableCHI = p(EnableCHI) val cacheParams = p(L2ParamKey) val XLEN = 64 @@ -48,6 +50,7 @@ trait HasCoupledL2Parameters { val offsetBits = log2Ceil(blockBytes) val beatBits = offsetBits - log2Ceil(beatBytes) val stateBits = MetaData.stateBits + val chiOpt = if (enableCHI) Some(true) else None val aliasBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.aliasBitsOpt // vaddr without offset bits @@ -65,6 +68,8 @@ trait HasCoupledL2Parameters { val releaseBufWPorts = 3 // sinkC & mainPipe s5 & mainPipe s3 (nested) + val mmioBridgeSize = cacheParams.mmioBridgeSize + // Prefetch val prefetchOpt = cacheParams.prefetch val hasPrefetchBit = prefetchOpt.nonEmpty && prefetchOpt.get.hasPrefetchBit @@ -97,7 +102,7 @@ trait HasCoupledL2Parameters { val grantBufInflightSize = mshrsAll //TODO: lack or excessive? !! 
WARNING // width params with bank idx (used in prefetcher / ctrl unit) - lazy val fullAddressBits = edgeOut.bundle.addressBits + lazy val fullAddressBits = edgeIn.bundle.addressBits lazy val fullTagBits = fullAddressBits - setBits - offsetBits // width params without bank idx (used in slice) lazy val addressBits = fullAddressBits - bankBits @@ -105,6 +110,8 @@ trait HasCoupledL2Parameters { lazy val outerSinkBits = edgeOut.bundle.sinkBits + val sam = cacheParams.sam + def getClientBitOH(sourceId: UInt): UInt = { if (clientBits == 0) { 0.U @@ -147,10 +154,31 @@ trait HasCoupledL2Parameters { (tag(tagBits - 1, 0), set(setBits - 1, 0), offset(offsetBits - 1, 0)) } + def restoreAddress(x: UInt, idx: Int) = { + restoreAddressUInt(x, idx.U) + } + + def restoreAddressUInt(x: UInt, idx: UInt) = { + if(bankBits == 0){ + x + } else { + val high = x >> offsetBits + val low = x(offsetBits - 1, 0) + Cat(high, idx(bankBits - 1, 0), low) + } + } + def getPPN(x: UInt): UInt = { x(x.getWidth - 1, pageOffsetBits) } + def arb[T <: Bundle](in: Seq[DecoupledIO[T]], out: DecoupledIO[T], name: Option[String] = None): Unit = { + val arb = Module(new Arbiter[T](chiselTypeOf(out.bits), in.size)) + if (name.nonEmpty) { arb.suggestName(s"${name.get}_arb") } + for ((a, req) <- arb.io.in.zip(in)) { a <> req } + out <> arb.io.out + } + def fastArb[T <: Bundle](in: Seq[DecoupledIO[T]], out: DecoupledIO[T], name: Option[String] = None): Unit = { val arb = Module(new FastArbiter[T](chiselTypeOf(out.bits), in.size)) if (name.nonEmpty) { arb.suggestName(s"${name.get}_arb") } @@ -164,341 +192,29 @@ trait HasCoupledL2Parameters { val opToA = VecInit(opSeq)(r) opToA } + + def sizeBytesToStr(sizeBytes: Double): String = sizeBytes match { + case _ if sizeBytes >= 1024 * 1024 => (sizeBytes / 1024 / 1024) + "MB" + case _ if sizeBytes >= 1024 => (sizeBytes / 1024) + "KB" + case _ => "B" + } + + def print_bundle_fields(fs: Seq[BundleFieldBase], prefix: String) = { + if(fs.nonEmpty){ + println(fs.map{f => s"$prefix/${f.key.name}: (${f.data.getWidth}-bit)"}.mkString("\n")) + } + } } -class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Parameters { +abstract class CoupledL2Base(implicit p: Parameters) extends LazyModule with HasCoupledL2Parameters { val xfer = TransferSizes(blockBytes, blockBytes) val atom = TransferSizes(1, cacheParams.channelBytes.d.get) val access = TransferSizes(1, blockBytes) - val clientPortParams = (m: TLMasterPortParameters) => TLMasterPortParameters.v2( - Seq( - TLMasterParameters.v2( - name = cacheParams.name, - supports = TLSlaveToMasterTransferSizes( - probe = xfer - ), - sourceId = IdRange(0, idsAll) - ) - ), - channelBytes = cacheParams.channelBytes, - minLatency = 1, - echoFields = cacheParams.echoField, - requestFields = cacheParams.reqField, - responseKeys = cacheParams.respKey - ) - - val managerPortParams = (m: TLSlavePortParameters) => TLSlavePortParameters.v1( - m.managers.map { m => - m.v2copy( - regionType = if (m.regionType >= RegionType.UNCACHED) RegionType.CACHED else m.regionType, - supports = TLMasterToSlaveTransferSizes( - acquireB = xfer, - acquireT = if (m.supportsAcquireT) xfer else TransferSizes.none, - arithmetic = if (m.supportsAcquireT) atom else TransferSizes.none, - logical = if (m.supportsAcquireT) atom else TransferSizes.none, - get = access, - putFull = if (m.supportsAcquireT) access else TransferSizes.none, - putPartial = if (m.supportsAcquireT) access else TransferSizes.none, - hint = access - ), - fifoId = None - ) - }, - beatBytes = 32, - minLatency = 
2, - responseFields = cacheParams.respField, - requestKeys = cacheParams.reqKey, - endSinkId = idsAll - ) - - val node = TLAdapterNode( - clientFn = clientPortParams, - managerFn = managerPortParams - ) - val pf_recv_node: Option[BundleBridgeSink[PrefetchRecv]] = prefetchOpt match { case Some(_: PrefetchReceiverParams) => Some(BundleBridgeSink(Some(() => new PrefetchRecv))) case _ => None } - val tpmeta_source_node = prefetchOpt match { - case Some(param: PrefetchReceiverParams) => - if (param.hasTPPrefetcher) Some(BundleBridgeSource(() => DecoupledIO(new TPmetaReq))) else None - case _ => None - } - val tpmeta_sink_node = prefetchOpt match { - case Some(param: PrefetchReceiverParams) => - if (param.hasTPPrefetcher) Some(BundleBridgeSink(Some(() => ValidIO(new TPmetaResp)))) else None - case _ => None - } - - class CoupledL2Imp(wrapper: LazyModule) extends LazyModuleImp(wrapper) { - val banks = node.in.size - val bankBits = if (banks == 1) 0 else log2Up(banks) - val l2TlbParams: Parameters = p.alterPartial { - case EdgeInKey => node.in.head._2 - case EdgeOutKey => node.out.head._2 - case BankBitsKey => bankBits - } - val io = IO(new Bundle { - val hartId = Input(UInt(hartIdLen.W)) - val l2_hint = ValidIO(new L2ToL1Hint()) - val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams) - val debugTopDown = new Bundle { - val robTrueCommit = Input(UInt(64.W)) - val robHeadPaddr = Flipped(Valid(UInt(36.W))) - val l2MissMatch = Output(Bool()) - } - }) - - // Display info - val sizeBytes = cacheParams.toCacheParams.capacity.toDouble - def sizeBytesToStr(sizeBytes: Double): String = sizeBytes match { - case _ if sizeBytes >= 1024 * 1024 => (sizeBytes / 1024 / 1024) + "MB" - case _ if sizeBytes >= 1024 => (sizeBytes / 1024) + "KB" - case _ => "B" - } - val sizeStr = sizeBytesToStr(sizeBytes) - val prefetch = "prefetch: " + cacheParams.prefetch - println(s"====== Inclusive ${cacheParams.name} ($sizeStr * $banks-bank) $prefetch ======") - println(s"bankBits: ${bankBits}") - println(s"replacement: ${cacheParams.replacement}") - println(s"replace policy: ${cacheParams.releaseData}") - println(s"sets:${cacheParams.sets} ways:${cacheParams.ways} blockBytes:${cacheParams.blockBytes}") - def print_bundle_fields(fs: Seq[BundleFieldBase], prefix: String) = { - if(fs.nonEmpty){ - println(fs.map{f => s"$prefix/${f.key.name}: (${f.data.getWidth}-bit)"}.mkString("\n")) - } - } - print_bundle_fields(node.in.head._2.bundle.requestFields, "usr") - print_bundle_fields(node.in.head._2.bundle.echoFields, "echo") - - node.edges.in.headOption.foreach { n => - n.client.clients.zipWithIndex.foreach { - case (c, i) => - println(s"\t${i} <= ${c.name};" + - s"\tsourceRange: ${c.sourceId.start}~${c.sourceId.end}") - } - } - - // connection between prefetcher and the slices - val pftParams: Parameters = p.alterPartial { - case EdgeInKey => node.in.head._2 - case EdgeOutKey => node.out.head._2 - case BankBitsKey => bankBits - } - val prefetcher = prefetchOpt.map(_ => Module(new Prefetcher()(pftParams))) - val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams))))) - val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchResp()(pftParams))))) - val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) - io.l2_tlb_req <> DontCare - prefetchOpt.foreach { - _ => - fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) - prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR - prefetcher.get.hartId := io.hartId - 
fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) - prefetcher.get.io.tlb_req <> io.l2_tlb_req - } - pf_recv_node match { - case Some(x) => - prefetcher.get.io.recv_addr.valid := x.in.head._1.addr_valid - prefetcher.get.io.recv_addr.bits.addr := x.in.head._1.addr - prefetcher.get.io.recv_addr.bits.pfSource := x.in.head._1.pf_source - prefetcher.get.io_l2_pf_en := x.in.head._1.l2_pf_en - case None => - prefetcher.foreach{ - p => - p.io.recv_addr := 0.U.asTypeOf(p.io.recv_addr) - p.io_l2_pf_en := false.B - } - } - - tpmeta_source_node match { - case Some(x) => - x.out.head._1 <> prefetcher.get.tpio.tpmeta_port.get.req - case None => - } - tpmeta_sink_node match { - case Some(x) => - prefetcher.get.tpio.tpmeta_port.get.resp <> x.in.head._1 - case None => - } - - def restoreAddress(x: UInt, idx: Int) = { - restoreAddressUInt(x, idx.U) - } - def restoreAddressUInt(x: UInt, idx: UInt) = { - if(bankBits == 0){ - x - } else { - val high = x >> offsetBits - val low = x(offsetBits - 1, 0) - Cat(high, idx(bankBits - 1, 0), low) - } - } - def bank_eq(set: UInt, bankId: Int, bankBits: Int): Bool = { - if(bankBits == 0) true.B else set(bankBits - 1, 0) === bankId.U - } - - // ** WARNING:TODO: this depends on where the latch is - // ** if Hint latched in slice, while D-Channel latched in XSTile - // ** we need only [hintCycleAhead - 1] later - val sliceAhead = hintCycleAhead - 1 - - val hintChosen = Wire(UInt(banks.W)) - val hintFire = Wire(Bool()) - - // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice - // then we releaseSourceD, enabling io.d.ready for other slices - // TODO: if Hint for single slice is 100% accurate, may consider remove this - val releaseSourceD = Wire(Vec(banks, Bool())) - val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR - - val slices = node.in.zip(node.out).zipWithIndex.map { - case (((in, edgeIn), (out, edgeOut)), i) => - require(in.params.dataBits == out.params.dataBits) - val rst_L2 = reset - val slice = withReset(rst_L2) { - Module(new Slice()(p.alterPartial { - case EdgeInKey => edgeIn - case EdgeOutKey => edgeOut - case BankBitsKey => bankBits - case SliceIdKey => i - })) - } - slice.io.in <> in - if (enableHintGuidedGrant) { - // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) - // we will try our best to select the grant of slice X. - // If slice X has no grant then, it means that the hint at cycle T is wrong, - // so we relax the restriction on grant selection. 
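
// A minimal sketch of the hint-guided grant logic being removed in this hunk
// (it presumably relocates along with the rest of CoupledL2Imp to the tl2tl /
// tl2chi variants): a hint that picks slice X at cycle T makes slice X's D
// channel the preferred one at cycles T+sliceAhead and T+sliceAhead+1. This
// assumes RegNextN is a plain n-cycle delay; a local delayN stands in for it.

import chisel3._
import chisel3.util._

class HintWindow(banks: Int, sliceAhead: Int) extends Module {
  val io = IO(new Bundle {
    val hintFire     = Input(Bool())
    val hintChosen   = Input(UInt(log2Up(banks).W)) // arbiter index, NOT one-hot
    val sliceCanFire = Output(Vec(banks, Bool()))
  })
  // n-cycle delay line; stands in for utility.RegNextN
  def delayN(x: Bool, n: Int): Bool =
    (0 until n).foldLeft(x)((v, _) => RegNext(v, false.B))
  for (i <- 0 until banks) {
    val hit = io.hintFire && io.hintChosen === i.U
    io.sliceCanFire(i) := delayN(hit, sliceAhead) || delayN(hit, sliceAhead + 1)
  }
}
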
- val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) || - RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1) - - releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid - - in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire) - slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire) - } - - in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i) - out <> slice.io.out - out.a.bits.address := restoreAddress(slice.io.out.a.bits.address, i) - out.c.bits.address := restoreAddress(slice.io.out.c.bits.address, i) - slice.io.sliceId := i.U - - slice.io.prefetch.zip(prefetcher).foreach { - case (s, p) => - s.req.valid := p.io.req.valid && bank_eq(p.io.req.bits.set, i, bankBits) - s.req.bits := p.io.req.bits - prefetchReqsReady(i) := s.req.ready && bank_eq(p.io.req.bits.set, i, bankBits) - val train = Pipeline(s.train) - val resp = Pipeline(s.resp) - prefetchTrains.get(i) <> train - prefetchResps.get(i) <> resp - // restore to full address - if(bankBits != 0){ - val train_full_addr = Cat( - train.bits.tag, train.bits.set, i.U(bankBits.W), 0.U(offsetBits.W) - ) - val (train_tag, train_set, _) = s.parseFullAddress(train_full_addr) - val resp_full_addr = Cat( - resp.bits.tag, resp.bits.set, i.U(bankBits.W), 0.U(offsetBits.W) - ) - val (resp_tag, resp_set, _) = s.parseFullAddress(resp_full_addr) - prefetchTrains.get(i).bits.tag := train_tag - prefetchTrains.get(i).bits.set := train_set - prefetchResps.get(i).bits.tag := resp_tag - prefetchResps.get(i).bits.set := resp_set - } - s.tlb_req.req.valid := false.B - s.tlb_req.req.bits := DontCare - s.tlb_req.req_kill := DontCare - s.tlb_req.resp.ready := true.B - } - - slice - } - - if(enableHintGuidedGrant) { - // for timing consideration, hint should latch one cycle before sending to L1 - // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false" - val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size)) - val slices_l1Hint = slices.zipWithIndex.map { - case (s, i) => s.io.l1Hint - } - // should only Hint for DCache - val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients - .filter(_.supports.probe) - .map(c => { - (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) - }).head - - l1HintArb.io.in <> VecInit(slices_l1Hint) - io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache - io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart - io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword - // continuous hints can only be sent every two cycle, since GrantData takes two cycles - l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B) - - hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT ! 
- hintFire := io.l2_hint.valid - } - - // ==================== TopDown ==================== - val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { - case EdgeInKey => node.in.head._2 - case EdgeOutKey => node.out.head._2 - case BankBitsKey => bankBits - }))) - topDown match { - case Some(t) => - t.io.msStatus.zip(slices).foreach { - case (in, s) => in := s.io.msStatus.get - } - t.io.dirResult.zip(slices).foreach { - case (res, s) => res := s.io.dirResult.get - } - t.io.latePF.zip(slices).foreach { - case (in, s) => in := s.io.latePF.get - } - t.io.debugTopDown <> io.debugTopDown - case None => io.debugTopDown.l2MissMatch := false.B - } - - // ==================== XSPerf Counters ==================== - val grant_data_fire = slices.map { slice => { - val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) - slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData - } - } - XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) - - val hint_source = io.l2_hint.bits.sourceId - - val grant_data_source = ParallelPriorityMux(slices.map { - s => (s.io.in.d.fire, s.io.in.d.bits.source) - }) - - val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) - hintPipe2.io.in.valid := io.l2_hint.valid - hintPipe2.io.in.bits := hint_source - hintPipe2.io.out.ready := true.B - - val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) - hintPipe1.io.in.valid := io.l2_hint.valid - hintPipe1.io.in.bits := hint_source - hintPipe1.io.out.ready := true.B - - val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source - XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) - - val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source - XSPerfAccumulate(cacheParams, "ok2Hints", okHint) - } - - lazy val module = new CoupledL2Imp(this) } diff --git a/src/main/scala/coupledL2/DataStorage.scala b/src/main/scala/coupledL2/DataStorage.scala index b50f15e7..6a68f4a6 100644 --- a/src/main/scala/coupledL2/DataStorage.scala +++ b/src/main/scala/coupledL2/DataStorage.scala @@ -51,7 +51,8 @@ class DataStorage(implicit p: Parameters) extends L2Module { gen = new DSBlock, set = blocks, way = 1, - singlePort = true + singlePort = true, + holdRead = true )) val arrayIdx = Cat(io.req.bits.way, io.req.bits.set) @@ -60,7 +61,10 @@ class DataStorage(implicit p: Parameters) extends L2Module { array.io.w.apply(wen, io.wdata, arrayIdx, 1.U) array.io.r.apply(ren, arrayIdx) - // TODO: timing: we should not use reg here, instead set this as multicycle path + // for timing, we set this as multicycle path // s3 read, s4 pass and s5 to destination - io.rdata := RegNextN(array.io.r.resp.data(0), 1) + io.rdata := array.io.r.resp.data(0) + + assert(!io.req.valid || !RegNext(io.req.valid, false.B), + "Continuous SRAM req prohibited under MCP2!") } diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala index 97b4d66a..8d14e79d 100644 --- a/src/main/scala/coupledL2/GrantBuffer.scala +++ b/src/main/scala/coupledL2/GrantBuffer.scala @@ -109,8 +109,9 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { })) val dtaskOpcode = io.d_task.bits.task.opcode - val mergeAtask = Wire(new TaskBundle()) + val mergeAtask = WireInit(0.U.asTypeOf(new TaskBundle())) mergeAtask.channel := io.d_task.bits.task.channel + mergeAtask.txChannel := io.d_task.bits.task.txChannel mergeAtask.off := io.d_task.bits.task.aMergeTask.off 
mergeAtask.alias.foreach(_ := io.d_task.bits.task.aMergeTask.alias.getOrElse(0.U)) mergeAtask.opcode := io.d_task.bits.task.aMergeTask.opcode @@ -275,10 +276,16 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { val noSpaceForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s => s.valid && (s.bits.fromA || s.bits.fromC) }).asUInt) + grantQueueCnt >= mshrsAll.U + val noSpaceWaitSinkEForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s => + s.valid && s.bits.fromA + }).asUInt) + PopCount(VecInit(inflightGrant.map(x => x.valid))) >= mshrsAll.U // for timing consideration, drop s1 info, so always reserve one entry for it val noSpaceForMSHRReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s => s.valid && (s.bits.fromA || s.bits.fromC) }).asUInt) + grantQueueCnt >= (mshrsAll-1).U + val noSpaceWaitSinkEForMSHRReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s => + s.valid && s.bits.fromA + }).asUInt) + PopCount(VecInit(inflightGrant.map(x => x.valid))) >= (mshrsAll - 1).U // pftRespQueue also requires back pressure to ensure that it will not exceed capacity // Ideally, it should only block Prefetch from entering MainPipe // But since it is extremely rare that pftRespQueue of 10 would be full, we just block all Entrance here, simpler logic @@ -290,14 +297,14 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { s.valid && s.bits.fromA }).asUInt) + pftRespQueue.get.io.count >= (pftQueueLen-1).U) - io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceForSinkPft.getOrElse(false.B) + io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceWaitSinkEForSinkReq || noSpaceForSinkPft.getOrElse(false.B) io.toReqArb.blockSinkReqEntrance.blockB_s1 := Cat(inflightGrant.map(g => g.valid && g.bits.set === io.fromReqArb.status_s1.b_set && g.bits.tag === io.fromReqArb.status_s1.b_tag)).orR //TODO: or should we still Stall B req? 
// A-replace related rprobe is handled in SourceB io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B // this is not used - io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B) + io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceWaitSinkEForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B) // =========== XSPerf =========== if (cacheParams.enablePerf) { diff --git a/src/main/scala/coupledL2/L2Param.scala b/src/main/scala/coupledL2/L2Param.scala index 5e0249fc..ae49383d 100644 --- a/src/main/scala/coupledL2/L2Param.scala +++ b/src/main/scala/coupledL2/L2Param.scala @@ -19,7 +19,7 @@ package coupledL2 import chisel3._ import chisel3.util.log2Ceil -import freechips.rocketchip.diplomacy.BufferParams +import freechips.rocketchip.diplomacy.{BufferParams, AddressSet} import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ import org.chipsalliance.cde.config.Field @@ -27,8 +27,7 @@ import huancun.{AliasKey, CacheParameters, IsHitKey, PrefetchKey} import coupledL2.prefetch._ import utility.{MemReqSource, ReqSourceKey} -// General parameter key of CoupledL2 -case object L2ParamKey extends Field[L2Param](L2Param()) +case object EnableCHI extends Field[Boolean](false) // L1 Cache Params, used for TestTop generation case class L1Param @@ -55,8 +54,7 @@ case class VaddrField(width: Int) extends BundleField[UInt](VaddrKey, Output(UIn case object IsKeywordKey extends ControlKey[Bool]("isKeyword") case class IsKeywordField() extends BundleField[Bool](IsKeywordKey, Output(Bool()), _ := false.B) -case class L2Param -( +case class L2Param( name: String = "L2", ways: Int = 4, sets: Int = 128, @@ -72,8 +70,9 @@ case class L2Param * 2 for all except prefetch & !accessed * 3 for all */ + mmioBridgeSize: Int = 8, - // Client (these are set in Configs.scala in XiangShan) + // Client echoField: Seq[BundleFieldBase] = Nil, reqField: Seq[BundleFieldBase] = Nil, respKey: Seq[BundleKeyBase] = Seq(IsHitKey), @@ -99,10 +98,15 @@ case class L2Param enableRollingDB: Boolean = true, // Monitor enableMonitor: Boolean = true, + // TLLog + enableTLLog: Boolean = true, // TopDown elaboratedTopDown: Boolean = true, // env - FPGAPlatform: Boolean = false + FPGAPlatform: Boolean = false, + + // Network layer SAM + sam: Seq[(AddressSet, Int)] = Seq(AddressSet.everything -> 33) ) { def toCacheParams: CacheParameters = CacheParameters( name = name, @@ -113,10 +117,14 @@ case class L2Param ) } +case object L2ParamKey extends Field[L2Param](L2Param()) + case object EdgeInKey extends Field[TLEdgeIn] case object EdgeOutKey extends Field[TLEdgeOut] case object BankBitsKey extends Field[Int] +case object L2NBanksKey extends Field[Int] + case object SliceIdKey extends Field[Int] diff --git a/src/main/scala/coupledL2/MSHRBuffer.scala b/src/main/scala/coupledL2/MSHRBuffer.scala index f13adcb6..fe012c2e 100644 --- a/src/main/scala/coupledL2/MSHRBuffer.scala +++ b/src/main/scala/coupledL2/MSHRBuffer.scala @@ -21,7 +21,6 @@ import chisel3._ import chisel3.util._ import org.chipsalliance.cde.config.Parameters import coupledL2.utils._ -import java.util.ResourceBundle class MSHRBufRead(implicit p: Parameters) extends L2Bundle { val id = Output(UInt(mshrBits.W)) @@ -34,6 +33,7 @@ class MSHRBufResp(implicit p: Parameters) extends L2Bundle { class MSHRBufWrite(implicit p: Parameters) extends L2Bundle { val id = Output(UInt(mshrBits.W)) val data = Output(new DSBlock) + val beatMask = Output(UInt(beatSize.W)) } // MSHR 
Buffer is used when MSHR needs to save data, so each buffer entry corresponds to an MSHR @@ -44,7 +44,7 @@ class MSHRBuffer(wPorts: Int = 1)(implicit p: Parameters) extends L2Module { val w = Vec(wPorts, Flipped(ValidIO(new MSHRBufWrite))) }) - val buffer = Reg(Vec(mshrsAll, new DSBlock)) + val buffer = Reg(Vec(mshrsAll, Vec(beatSize, UInt((beatBytes * 8).W)))) buffer.zipWithIndex.foreach { case (block, i) => @@ -52,13 +52,17 @@ class MSHRBuffer(wPorts: Int = 1)(implicit p: Parameters) extends L2Module { assert(PopCount(wens) <= 2.U, "triple write to the same MSHR buffer entry") val w_data = PriorityMux(wens, io.w.map(_.bits.data)) + val w_beatSel = PriorityMux(wens, io.w.map(_.bits.beatMask)) when(wens.orR) { - block := w_data + // block := w_data + block.zip(w_beatSel.asBools).zipWithIndex.foreach { case ((beat, sel), i) => + when (sel) { beat := w_data.data((i+1) * beatBytes * 8 - 1, i * beatBytes * 8) } + } } } val ridReg = RegEnable(io.r.bits.id, 0.U(mshrBits.W), io.r.valid) - io.resp.data := buffer(ridReg) + io.resp.data.data := buffer(ridReg).asUInt } // may consider just choose an empty entry to insert diff --git a/src/main/scala/coupledL2/RequestArb.scala b/src/main/scala/coupledL2/RequestArb.scala index e3186023..7e9a6a0b 100644 --- a/src/main/scala/coupledL2/RequestArb.scala +++ b/src/main/scala/coupledL2/RequestArb.scala @@ -24,6 +24,9 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import org.chipsalliance.cde.config.Parameters import coupledL2.utils.XSPerfAccumulate +import coupledL2.tl2tl._ +import coupledL2.tl2chi._ +import coupledL2.tl2chi.CHIOpcode._ class RequestArb(implicit p: Parameters) extends L2Module { val io = IO(new Bundle() { @@ -53,7 +56,8 @@ class RequestArb(implicit p: Parameters) extends L2Module { /* status of each pipeline stage */ val status_s1 = Output(new PipeEntranceStatus) // set & tag of entrance status - val status_vec = Vec(2, ValidIO(new PipeStatus)) // whether this stage will flow into SourceD + val status_vec = Vec(2, ValidIO(new PipeStatus)) + val status_vec_toTX = if (enableCHI) Some(Vec(2, ValidIO(new PipeStatusWithCHI))) else None /* handle set conflict, capacity conflict */ val fromMSHRCtl = Input(new BlockInfo()) @@ -62,10 +66,13 @@ class RequestArb(implicit p: Parameters) extends L2Module { val blockSinkReqEntrance = new BlockInfo() val blockMSHRReqEntrance = Bool() }) - val fromSourceC = Input(new Bundle() { - val blockSinkBReqEntrance = Bool() - val blockMSHRReqEntrance = Bool() - }) + val fromSourceC = if (!enableCHI) Some(Input(new SourceCBlockBundle)) else None + val fromTXDAT = if (enableCHI) Some(Input(new TXDATBlockBundle)) else None + val fromTXRSP = if (enableCHI) Some(Input(new TXRSPBlockBundle)) else None + val fromTXREQ = if (enableCHI) Some(Input(new TXBlockBundle)) else None + + /* MSHR Status */ + val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo()))) }) /* ======== Reset ======== */ @@ -79,7 +86,10 @@ class RequestArb(implicit p: Parameters) extends L2Module { resetFinish := true.B } - val mshr_task_s0 = Wire(Valid(new TaskBundle())) + val s0_fire = Wire(Bool()) + val s1_fire = Wire(Bool()) + val s1_cango = Wire(Bool()) + val s2_ready = Wire(Bool()) val mshr_task_s1 = RegInit(0.U.asTypeOf(Valid(new TaskBundle()))) val s1_needs_replRead = mshr_task_s1.valid && mshr_task_s1.bits.fromA && mshr_task_s1.bits.replTask && ( @@ -91,35 +101,48 @@ class RequestArb(implicit p: Parameters) extends L2Module { /* ======== Stage 0 ======== */ // if mshr_task_s1 is replRead, it might 
stall and wait for dirRead.ready, so we block new mshrTask from entering // TODO: will cause msTask path vacant for one-cycle after replRead, since not use Flow so as to avoid ready propagation - io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance && !s1_needs_replRead && !io.fromSourceC.blockMSHRReqEntrance - mshr_task_s0.valid := io.mshrTask.fire - mshr_task_s0.bits := io.mshrTask.bits + io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance && !s1_needs_replRead && !(mshr_task_s1.valid && !s2_ready) + (if (io.fromSourceC.isDefined) !io.fromSourceC.get.blockMSHRReqEntrance else true.B) && + (if (io.fromTXDAT.isDefined) !io.fromTXDAT.get.blockMSHRReqEntrance else true.B) && + (if (io.fromTXRSP.isDefined) !io.fromTXRSP.get.blockMSHRReqEntrance else true.B) && + (if (io.fromTXREQ.isDefined) !io.fromTXREQ.get.blockMSHRReqEntrance else true.B) + + s0_fire := io.mshrTask.valid && io.mshrTask.ready /* ======== Stage 1 ======== */ /* latch mshr_task from s0 to s1 */ val mshr_replRead_stall = mshr_task_s1.valid && s1_needs_replRead && (!io.dirRead_s1.ready || io.fromMainPipe.blockG_s1) + mshr_task_s1.valid := mshr_task_s1.valid && !s1_fire || s0_fire - mshr_task_s1.valid := mshr_task_s0.valid || mshr_replRead_stall - when(mshr_task_s0.valid && !mshr_replRead_stall) { - mshr_task_s1.bits := mshr_task_s0.bits + when (s0_fire) { + mshr_task_s1.bits := io.mshrTask.bits } + /* Channel interaction from s1 */ val A_task = io.sinkA.bits val B_task = io.sinkB.bits val C_task = io.sinkC.bits val block_A = io.fromMSHRCtl.blockA_s1 || io.fromMainPipe.blockA_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockA_s1 - val block_B = io.fromMSHRCtl.blockB_s1 || io.fromMainPipe.blockB_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockB_s1 || io.fromSourceC.blockSinkBReqEntrance + val block_B = io.fromMSHRCtl.blockB_s1 || io.fromMainPipe.blockB_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockB_s1 || + (if (io.fromSourceC.isDefined) io.fromSourceC.get.blockSinkBReqEntrance else false.B) || + (if (io.fromTXDAT.isDefined) io.fromTXDAT.get.blockSinkBReqEntrance else false.B) || + (if (io.fromTXRSP.isDefined) io.fromTXRSP.get.blockSinkBReqEntrance else false.B) val block_C = io.fromMSHRCtl.blockC_s1 || io.fromMainPipe.blockC_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockC_s1 + val noFreeWay = Wire(Bool()) + val sinkValids = VecInit(Seq( io.sinkC.valid && !block_C, io.sinkB.valid && !block_B, - io.sinkA.valid && !block_A + io.sinkA.valid && !block_A && !noFreeWay )).asUInt - val sink_ready_basic = io.dirRead_s1.ready && resetFinish && !mshr_task_s1.valid - io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) // SinkC prior to SinkA & SinkB + // TODO: A Hint is allowed to enter if !s2_ready for mcp2_stall + + val sink_ready_basic = io.dirRead_s1.ready && resetFinish && !mshr_task_s1.valid && s2_ready + + io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) && !noFreeWay // SinkC prior to SinkA & SinkB io.sinkB.ready := sink_ready_basic && !block_B && !sinkValids(0) // SinkB prior to SinkA io.sinkC.ready := sink_ready_basic && !block_C @@ -132,12 +155,15 @@ class RequestArb(implicit p: Parameters) extends L2Module { val task_s1 = Mux(mshr_task_s1.valid, mshr_task_s1, chnl_task_s1) val s1_to_s2_valid = task_s1.valid && !mshr_replRead_stall - io.taskInfo_s1.valid := s1_to_s2_valid + s1_cango := task_s1.valid && !mshr_replRead_stall + s1_fire := s1_cango && s2_ready + + io.taskInfo_s1.valid := s1_fire io.taskInfo_s1.bits := task_s1.bits /* 
Meta read request */ // ^ only sinkA/B/C tasks need to read directory - io.dirRead_s1.valid := chnl_task_s1.valid && !mshr_task_s1.valid || s1_needs_replRead && !io.fromMainPipe.blockG_s1 + io.dirRead_s1.valid := s2_ready && (chnl_task_s1.valid && !mshr_task_s1.valid || s1_needs_replRead && !io.fromMainPipe.blockG_s1) io.dirRead_s1.bits.set := task_s1.bits.set io.dirRead_s1.bits.tag := task_s1.bits.tag // invalid way which causes mshr_retry @@ -150,7 +176,7 @@ class RequestArb(implicit p: Parameters) extends L2Module { io.dirRead_s1.bits.mshrId := task_s1.bits.mshrId // block same-set A req - io.s1Entrance.valid := mshr_task_s1.valid && mshr_task_s1.bits.metaWen || io.sinkC.fire || io.sinkB.fire + io.s1Entrance.valid := mshr_task_s1.valid && s2_ready && mshr_task_s1.bits.metaWen || io.sinkC.fire || io.sinkB.fire io.s1Entrance.bits.set := Mux( mshr_task_s1.valid && mshr_task_s1.bits.metaWen, mshr_task_s1.bits.set, @@ -158,9 +184,22 @@ class RequestArb(implicit p: Parameters) extends L2Module { ) /* ======== Stage 2 ======== */ + val s1_AHint_fire = io.sinkA.fire && io.sinkA.bits.opcode === Hint + // any req except AHint might access DS, and continuous DS accesses are prohibited + val ds_mcp2_stall = RegNext(s1_fire && !s1_AHint_fire) + + s2_ready := !ds_mcp2_stall + val task_s2 = RegInit(0.U.asTypeOf(task_s1)) - task_s2.valid := s1_to_s2_valid - when(s1_to_s2_valid) { task_s2.bits := task_s1.bits } + task_s2.valid := s1_fire + when(s1_fire) { task_s2.bits := task_s1.bits } + + val sameSet_s2 = task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask && task_s2.bits.set === A_task.set + val sameSet_s3 = RegNext(task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask) && + RegEnable(task_s2.bits.set, task_s2.valid) === A_task.set + val sameSetCnt = PopCount(VecInit(io.msInfo.map(s => s.valid && s.bits.set === A_task.set && s.bits.fromA) :+ + sameSet_s2 :+ sameSet_s3).asUInt) + noFreeWay := sameSetCnt >= cacheParams.ways.U io.taskToPipe_s2 := task_s2 @@ -172,19 +211,49 @@ class RequestArb(implicit p: Parameters) extends L2Module { // For GrantData, read refillBuffer // Caution: GrantData-alias may read DataStorage or ReleaseBuf instead // Release-replTask also read refillBuf and then write to DS + val releaseRefillData = task_s2.bits.replTask && (if (enableCHI) { + task_s2.bits.toTXREQ && ( + task_s2.bits.chiOpcode.get === REQOpcodes.WriteBackFull || + task_s2.bits.chiOpcode.get === REQOpcodes.Evict + ) + } else { + task_s2.bits.opcode(2, 1) === Release(2, 1) + }) io.refillBufRead_s2.valid := mshrTask_s2 && ( - task_s2.bits.fromB && task_s2.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s2.bits.replTask || - task_s2.bits.opcode(2, 1) === Release(2, 1) && task_s2.bits.replTask || + task_s2.bits.fromB && task_s2.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s2.bits.replTask || // ??? 
+ releaseRefillData || mshrTask_s2_a_upwards && !task_s2.bits.useProbeData) io.refillBufRead_s2.bits.id := task_s2.bits.mshrId // ReleaseData and ProbeAckData read releaseBuffer // channel is used to differentiate GrantData and ProbeAckData - io.releaseBufRead_s2.valid := mshrTask_s2 && ( - task_s2.bits.opcode === ReleaseData || - task_s2.bits.fromB && task_s2.bits.opcode === ProbeAckData || - mshrTask_s2_a_upwards && task_s2.bits.useProbeData) - io.releaseBufRead_s2.bits.id := task_s2.bits.mshrId + val snoopNeedData = if (enableCHI) { + task_s2.bits.fromB && task_s2.bits.toTXDAT && DATOpcodes.isSnpRespDataX(task_s2.bits.chiOpcode.get) + } else { + task_s2.bits.fromB && task_s2.bits.opcode === ProbeAckData + } + val releaseNeedData = if (enableCHI) { + task_s2.bits.toTXDAT && task_s2.bits.chiOpcode.get === DATOpcodes.CopyBackWrData + } else task_s2.bits.opcode === ReleaseData + val dctNeedData = if (enableCHI) { + task_s2.bits.toTXDAT && task_s2.bits.chiOpcode.get === DATOpcodes.CompData + } else false.B + val snpHitReleaseNeedData = if (enableCHI) { + !mshrTask_s2 && task_s2.bits.fromB && task_s2.bits.snpHitReleaseWithData + } else false.B + io.releaseBufRead_s2.valid := Mux( + mshrTask_s2, + releaseNeedData || + snoopNeedData || + dctNeedData || + mshrTask_s2_a_upwards && task_s2.bits.useProbeData, + snpHitReleaseNeedData + ) + io.releaseBufRead_s2.bits.id := Mux( + task_s2.bits.snpHitRelease, + task_s2.bits.snpHitReleaseIdx, + task_s2.bits.mshrId + ) require(beatSize == 2) @@ -192,6 +261,7 @@ class RequestArb(implicit p: Parameters) extends L2Module { io.status_s1.sets := VecInit(Seq(C_task.set, B_task.set, io.ASet, mshr_task_s1.bits.set)) io.status_s1.tags := VecInit(Seq(C_task.tag, B_task.tag, io.ATag, mshr_task_s1.bits.tag)) // io.status_s1.isKeyword := VecInit(Seq(C_task.isKeyword, B_task.isKeyword, io.isKeyword, mshr_task_s1.bits.isKeyword)) + require(io.status_vec.size == 2) io.status_vec.zip(Seq(task_s1, task_s2)).foreach { case (status, task) => @@ -199,10 +269,21 @@ class RequestArb(implicit p: Parameters) extends L2Module { status.bits.channel := task.bits.channel } + if (enableCHI) { + require(io.status_vec_toTX.get.size == 2) + io.status_vec_toTX.get.zip(Seq(task_s1, task_s2)).foreach { + case (status, task) => + status.valid := task.valid + status.bits.channel := task.bits.channel + status.bits.txChannel := task.bits.txChannel + status.bits.mshrTask := task.bits.mshrTask + } + } + dontTouch(io) // Performance counters - XSPerfAccumulate(cacheParams, "mshr_req", mshr_task_s0.valid) + XSPerfAccumulate(cacheParams, "mshr_req", s0_fire) XSPerfAccumulate(cacheParams, "mshr_req_stall", io.mshrTask.valid && !io.mshrTask.ready) XSPerfAccumulate(cacheParams, "sinkA_req", io.sinkA.fire) diff --git a/src/main/scala/coupledL2/RequestBuffer.scala b/src/main/scala/coupledL2/RequestBuffer.scala index d6619486..c3a6474c 100644 --- a/src/main/scala/coupledL2/RequestBuffer.scala +++ b/src/main/scala/coupledL2/RequestBuffer.scala @@ -1,3 +1,20 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + package coupledL2 import org.chipsalliance.cde.config.Parameters @@ -5,6 +22,7 @@ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.tilelink.TLPermissions._ import chisel3._ import chisel3.util._ +import coupledL2._ import coupledL2.utils._ import utility._ @@ -141,12 +159,12 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete e.valid && sameAddr(in, e.task) ) ).asUInt - val dup = io.in.valid && isPrefetch && dupMask.orR + val dup = isPrefetch && dupMask.orR //!! TODO: we can also remove those that duplicate with mainPipe /* ======== Alloc ======== */ - io.in.ready := !full || doFlow || mergeA + io.in.ready := !full || doFlow || mergeA || dup val insertIdx = PriorityEncoder(buffer.map(!_.valid)) val alloc = !full && io.in.valid && !doFlow && !dup && !mergeA @@ -208,7 +226,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete // so when waitMP(1) is 0 and waitMP(0) is 1, desired cycleCnt reached // we recalculate waitMS and occWays, overriding old mask // to take new allocated MSHR into account - e.waitMP := e.waitMP >> 1.U + e.waitMP := e.waitMP >> 1 when(e.waitMP(1) === 0.U && e.waitMP(0) === 1.U) { waitMSUpdate := conflictMask(e.task) } @@ -223,7 +241,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete val s1B_Block = io.s1Entrance.valid && io.s1Entrance.bits.set === e.task.set val s1_Block = s1A_Block || s1B_Block when(s1_Block) { - e.waitMP := e.waitMP | "b0100".U // fired-req at s2 next cycle + e.waitMP := (e.waitMP >> 1) | "b0100".U // fired-req at s2 next cycle } // update info @@ -252,7 +270,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete // add XSPerf to see how many cycles the req is held in Buffer if(cacheParams.enablePerf) { - XSPerfAccumulate(cacheParams, "drop_prefetch", dup) + XSPerfAccumulate(cacheParams, "drop_prefetch", io.in.valid && dup) if(flow){ XSPerfAccumulate(cacheParams, "req_buffer_flow", io.in.valid && doFlow) } diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala index da142384..45764060 100644 --- a/src/main/scala/coupledL2/SinkA.scala +++ b/src/main/scala/coupledL2/SinkA.scala @@ -38,7 +38,9 @@ class SinkA(implicit p: Parameters) extends L2Module { def fromTLAtoTaskBundle(a: TLBundleA): TaskBundle = { val task = Wire(new TaskBundle) + task := 0.U.asTypeOf(new TaskBundle) task.channel := "b001".U + task.txChannel := 0.U task.tag := parseAddress(a.address)._1 task.set := parseAddress(a.address)._2 task.off := parseAddress(a.address)._3 @@ -67,7 +69,7 @@ class SinkA(implicit p: Parameters) extends L2Module { task.replTask := false.B task.vaddr.foreach(_ := a.user.lift(VaddrKey).getOrElse(0.U)) //miss acquire keyword - task.isKeyword.foreach(_ := a.echo.lift(IsKeywordKey).getOrElse(false.B)) + task.isKeyword.foreach(_ := a.echo.lift(IsKeywordKey).getOrElse(false.B)) task.mergeA := false.B task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) task @@ -75,6 +77,7 @@ class SinkA(implicit p: Parameters) extends L2Module { def 
fromPrefetchReqtoTaskBundle(req: PrefetchReq): TaskBundle = { val task = Wire(new TaskBundle) val fullAddr = Cat(req.tag, req.set, 0.U(offsetBits.W)) + task := 0.U.asTypeOf(new TaskBundle) task.channel := "b001".U task.tag := parseAddress(fullAddr)._1 task.set := parseAddress(fullAddr)._2 diff --git a/src/main/scala/coupledL2/SinkC.scala b/src/main/scala/coupledL2/SinkC.scala index 25fb58d7..29670309 100644 --- a/src/main/scala/coupledL2/SinkC.scala +++ b/src/main/scala/coupledL2/SinkC.scala @@ -67,7 +67,9 @@ class SinkC(implicit p: Parameters) extends L2Module { def toTaskBundle(c: TLBundleC): TaskBundle = { val task = Wire(new TaskBundle) + task := 0.U.asTypeOf(new TaskBundle) task.channel := "b100".U + task.txChannel := 0.U task.tag := parseAddress(c.address)._1 task.set := parseAddress(c.address)._2 task.off := parseAddress(c.address)._3 @@ -145,6 +147,7 @@ class SinkC(implicit p: Parameters) extends L2Module { io.resp.mshrId := 0.U // DontCare io.resp.tag := parseAddress(io.c.bits.address)._1 io.resp.set := parseAddress(io.c.bits.address)._2 + io.resp.respInfo := 0.U.asTypeOf(io.resp.respInfo.cloneType) io.resp.respInfo.opcode := io.c.bits.opcode io.resp.respInfo.param := io.c.bits.param io.resp.respInfo.last := last @@ -158,6 +161,7 @@ class SinkC(implicit p: Parameters) extends L2Module { io.releaseBufWrite.valid := io.c.valid && io.c.bits.opcode === ProbeAckData && last io.releaseBufWrite.bits.id := 0.U(mshrBits.W) // id is given by MSHRCtl by comparing address to the MSHRs io.releaseBufWrite.bits.data.data := Cat(io.c.bits.data, probeAckDataBuf) + io.releaseBufWrite.bits.beatMask := Fill(beatSize, true.B) // C-Release, with new data, comes before repl-Release writes old refill data back to DS val newdataMask = VecInit(io.msInfo.map(s => @@ -174,6 +178,7 @@ class SinkC(implicit p: Parameters) extends L2Module { io.refillBufWrite.valid := RegNext(io.task.fire && io.task.bits.opcode === ReleaseData && newdataMask.orR, false.B) io.refillBufWrite.bits.id := RegNext(OHToUInt(newdataMask)) io.refillBufWrite.bits.data.data := dataBuf(RegNext(io.task.bits.bufIdx)).asUInt + io.refillBufWrite.bits.beatMask := Fill(beatSize, true.B) io.c.ready := !isRelease || !first || !full diff --git a/src/main/scala/coupledL2/TopDownMonitor.scala b/src/main/scala/coupledL2/TopDownMonitor.scala index d1c1d777..00fd11b3 100644 --- a/src/main/scala/coupledL2/TopDownMonitor.scala +++ b/src/main/scala/coupledL2/TopDownMonitor.scala @@ -21,8 +21,10 @@ import chisel3._ import chisel3.util._ import coupledL2.prefetch.PfSource import coupledL2.utils._ +import coupledL2.tl2tl.MSHRStatus import utility.MemReqSource +// TODO: Accommodate CHI class TopDownMonitor()(implicit p: Parameters) extends L2Module { val banks = 1 << bankBits val io = IO(new Bundle() { diff --git a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala index 92cde9d0..7a4a329b 100644 --- a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala +++ b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala @@ -27,11 +27,11 @@ import utility.{MemReqSource, Pipeline} // TODO: PrefetchReceiver is temporarily used since L1&L2 do not support Hint. // TODO: Delete this after Hint is accomplished. 
-case class PrefetchReceiverParams(n: Int = 32) extends PrefetchParameters { +case class PrefetchReceiverParams(n: Int = 32, tp: Boolean = true) extends PrefetchParameters { override val hasPrefetchBit: Boolean = true override val hasPrefetchSrc: Boolean = true override val inflightEntries: Int = n - val hasTPPrefetcher: Boolean = true + val hasTPPrefetcher: Boolean = tp } class PrefetchReceiver()(implicit p: Parameters) extends PrefetchModule { diff --git a/src/main/scala/coupledL2/prefetch/Prefetcher.scala b/src/main/scala/coupledL2/prefetch/Prefetcher.scala index cf08fc61..a28511fa 100644 --- a/src/main/scala/coupledL2/prefetch/Prefetcher.scala +++ b/src/main/scala/coupledL2/prefetch/Prefetcher.scala @@ -291,9 +291,15 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule { ) ))) }))) - val tp = Module(new TemporalPrefetch()(p.alterPartial({ - case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters())) - }))) + val tp = prefetchOpt match { + case Some(param: PrefetchReceiverParams) => + if (param.hasTPPrefetcher) { + Some(Module(new TemporalPrefetch()(p.alterPartial({ + case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters())) + })))) + } else None + case _ => None + } val pftQueue = Module(new PrefetchQueue) val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B)) @@ -325,93 +331,43 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule { pbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U) pbop.io.resp <> io.resp pbop.io.resp.valid := io.resp.valid && io.resp.bits.isPBOP - tp.io.train <> io.train - tp.io.resp <> io.resp - tp.io.hartid := hartId + tp.foreach(_.io.train <> io.train) + tp.foreach(_.io.resp <> io.resp) + tp.foreach(_.io.hartid := hartId) pfRcv.io.req.ready := true.B vbop.io.req.ready := true.B pbop.io.req.ready := true.B - tp.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid + tp.foreach(_.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid) pipe.io.in <> pftQueue.io.deq io.req <> pipe.io.out // tpmeta interface - tp.io.tpmeta_port <> tpio.tpmeta_port.get + tp.foreach(_.io.tpmeta_port <> tpio.tpmeta_port.get) /* pri vbop */ - pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid)) + pftQueue.io.enq.valid := pfRcv.io.req.valid || + (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || (if (tp.isDefined) tp.get.io.req.valid else false.B))) pftQueue.io.enq.bits := ParallelPriorityMux(Seq( pfRcv.io.req.valid -> pfRcv.io.req.bits, vbop.io.req.valid -> vbop.io.req.bits, pbop.io.req.valid -> pbop.io.req.bits, - tp.io.req.valid -> tp.io.req.bits + if (tp.isDefined) { tp.get.io.req.valid -> tp.get.io.req.bits } + else { false.B -> DontCare } )) XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + if (tp.isDefined) + XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.get.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && 
!pfRcv.io.req.valid && !vbop.io.req.valid && pbop.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + if (tp.isDefined) + XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.get.io.req.valid) XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) - - /* pri pbop */ - // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid)) - // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( - // pfRcv.io.req.valid -> pfRcv.io.req.bits, - // pbop.io.req.valid -> pbop.io.req.bits, - // vbop.io.req.valid -> vbop.io.req.bits, - // tp.io.req.valid -> tp.io.req.bits - // )) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && vbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) - - /* solo vbop */ - // vbop.io.pbopCrossPage := true.B - // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) - // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( - // pfRcv.io.req.valid -> pfRcv.io.req.bits, - // vbop.io.req.valid -> vbop.io.req.bits, - // tp.io.req.valid -> tp.io.req.bits - // )) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) - - /* solo pbop */ - // vbop.io.train.valid := false.B - // vbop.io.resp.valid := false.B - // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (pbop.io.req.valid || tp.io.req.valid)) - // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( - // pfRcv.io.req.valid -> pfRcv.io.req.bits, - // pbop.io.req.valid -> pbop.io.req.bits, - // tp.io.req.valid -> tp.io.req.bits - // )) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, 
"prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) - // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || (if (tp.isDefined) tp.get.io.req.valid else false.B))) case _ => assert(cond = false, "Unknown prefetcher") } -} \ No newline at end of file +} diff --git a/src/main/scala/coupledL2/tl2chi/Bundle.scala b/src/main/scala/coupledL2/tl2chi/Bundle.scala new file mode 100644 index 00000000..ab5f7e54 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/Bundle.scala @@ -0,0 +1,74 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.tilelink.TLPermissions._ +import utility.MemReqSource +import coupledL2.{HasTLChannelBits, DirResult, PipeStatus} + +object CHIChannel { + def TXREQ = "b001".U + def TXRSP = "b010".U + def TXDAT = "b100".U +} + +trait HasCHIChannelBits { this: Bundle => + val txChannel = UInt(3.W) + def toTXREQ = txChannel(0).asBool + def toTXRSP = txChannel(1).asBool + def toTXDAT = txChannel(2).asBool +} + +class PipeStatusWithCHI(implicit p: Parameters) extends PipeStatus + with HasCHIChannelBits { + val mshrTask = Bool() +} + +class MSHRStatus(implicit p: Parameters) extends TL2CHIL2Bundle + with HasTLChannelBits + with HasCHIChannelBits { + // TODO + val set = UInt(setBits.W) + val reqTag = UInt(tagBits.W) + val metaTag = UInt(tagBits.W) + val needsRepl = Bool() + val w_c_resp = Bool() + val w_d_resp = Bool() + val will_free = Bool() + +// val way = UInt(wayBits.W) +// val off = UInt(offsetBits.W) +// val opcode = UInt(3.W) +// val param = UInt(3.W) +// val size = UInt(msgSizeBits.W) +// val source = UInt(sourceIdBits.W) +// val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W)) +// val aliasTask = aliasBitsOpt.map(_ => Bool()) +// val needProbeAckData = Bool() // only for B reqs +// val fromL2pft = prefetchOpt.map(_ => Bool()) +// val needHint = prefetchOpt.map(_ => Bool()) + + // for TopDown usage + val reqSource = UInt(MemReqSource.reqSourceBits.W) + val is_miss = Bool() + val is_prefetch = Bool() + +} diff --git a/src/main/scala/coupledL2/tl2chi/MMIOBridge.scala b/src/main/scala/coupledL2/tl2chi/MMIOBridge.scala new file mode 100644 index 00000000..9140fd27 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/MMIOBridge.scala @@ -0,0 +1,327 @@ + +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import coupledL2.HasCoupledL2Parameters +import coupledL2.tl2chi.CHIOpcode._ + +class MMIOBridge()(implicit p: Parameters) extends LazyModule + with HasCoupledL2Parameters + with HasCHIMsgParameters { + + override def shouldBeInlined: Boolean = false + + /** + * MMIO node + */ + val onChipPeripheralRange = AddressSet(0x38000000L, 0x07ffffffL) + val uartRange = AddressSet(0x40600000, 0xf) + val uartDevice = new SimpleDevice("serial", Seq("xilinx,uartlite")) + val uartParams = TLSlaveParameters.v1( + address = Seq(uartRange), + resources = uartDevice.reg, + regionType = RegionType.UNCACHED, + supportsGet = TransferSizes(1, 8), + supportsPutFull = TransferSizes(1, 8), + supportsPutPartial = TransferSizes(1, 8) + ) + val peripheralRange = AddressSet( + 0x0, 0x7fffffff + ).subtract(onChipPeripheralRange).flatMap(x => x.subtract(uartRange)) + + val mmioNode = TLManagerNode(Seq(TLSlavePortParameters.v1( + managers = Seq(TLSlaveParameters.v1( + address = peripheralRange, + regionType = RegionType.UNCACHED, + supportsGet = TransferSizes(1, 8), + supportsPutFull = TransferSizes(1, 8), + supportsPutPartial = TransferSizes(1, 8) + ), uartParams), + beatBytes = 8 + ))) + + lazy val module = new MMIOBridgeImp(this) + +} + +class MMIOBridgeEntry(edge: TLEdgeIn)(implicit p: Parameters) extends TL2CHIL2Module { + + val needRR = true + val order = WireInit(if (needRR) OrderEncodings.EndpointOrder else OrderEncodings.None) + + val io = IO(new Bundle() { + val req = Flipped(DecoupledIO(new TLBundleA(edge.bundle))) + val resp = DecoupledIO(new TLBundleD(edge.bundle)) + val chi = new DecoupledNoSnpPortIO + val id = Input(UInt()) + val pCrdQuery = Output(ValidIO(new Bundle() { + val pCrdType = UInt(PCRDTYPE_WIDTH.W) + })) + val pCrdGrant = Input(Bool()) + val waitOnReadReceipt = Option.when(needRR)(Output(Bool())) + }) + + val s_txreq = RegInit(true.B) + val s_ncbwrdata = RegInit(true.B) + // val s_readrecript = RegInit(true.B) // TODO + // val s_compack = RegInit(true.B) // TODO + val s_resp = RegInit(true.B) + val w_comp = RegInit(true.B) + val w_dbidresp = RegInit(true.B) + val w_compdata = RegInit(true.B) + val w_pcrdgrant = RegInit(true.B) + val w_readreceipt = Option.when(needRR)(RegInit(true.B)) + + val no_schedule = s_txreq && s_ncbwrdata && s_resp + val no_wait = w_comp && w_dbidresp && w_compdata && w_pcrdgrant && w_readreceipt.getOrElse(true.B) + + val req = RegEnable(io.req.bits, io.req.fire) + val req_valid = !no_schedule || !no_wait + val rdata = Reg(UInt(DATA_WIDTH.W)) + val srcID = Reg(UInt(SRCID_WIDTH.W)) + val dbID = Reg(UInt(DBID_WIDTH.W)) + val allowRetry = RegInit(true.B) + val pCrdType = Reg(UInt(PCRDTYPE_WIDTH.W)) + val isRead = req.opcode === Get + + val wordBits = io.req.bits.data.getWidth // 64 + val wordBytes = wordBits / 8 + val words = DATA_WIDTH / wordBits + val wordIdxBits = log2Ceil(words) + require(wordBits == 64) + require(wordIdxBits == 2) + val reqWordIdx = (req.address >> log2Ceil(wordBytes))(wordIdxBits - 1, 0) + + val txreq = io.chi.tx.req + val txdat = io.chi.tx.dat + val rxdat = io.chi.rx.dat + val rxrsp = io.chi.rx.rsp + + /** + * Entry allocation + */ + when (io.req.fire) { + s_txreq := false.B + s_resp := 
false.B + allowRetry := true.B + when (io.req.bits.opcode === Get) { + w_compdata := false.B + w_readreceipt.foreach(_ := false.B) + }.elsewhen (io.req.bits.opcode === PutFullData || io.req.bits.opcode === PutPartialData) { + w_comp := false.B + w_dbidresp := false.B + s_ncbwrdata := false.B + } + } + + /** + * State flags recover + */ + when (txreq.fire) { + s_txreq := true.B + } + when (rxdat.fire) { + w_compdata := true.B + rdata := rxdat.bits.data + } + when (io.resp.fire) { + s_resp := true.B + } + when (rxrsp.fire) { + when (rxrsp.bits.opcode === RSPOpcodes.CompDBIDResp || rxrsp.bits.opcode === RSPOpcodes.Comp) { + w_comp := true.B + } + when (rxrsp.bits.opcode === RSPOpcodes.CompDBIDResp || rxrsp.bits.opcode === RSPOpcodes.DBIDResp) { + w_dbidresp := true.B + srcID := rxrsp.bits.srcID + dbID := rxrsp.bits.dbID + } + when (rxrsp.bits.opcode === RSPOpcodes.RetryAck) { + s_txreq := false.B + w_pcrdgrant := false.B + allowRetry := false.B + pCrdType := rxrsp.bits.pCrdType + } + when (rxrsp.bits.opcode === RSPOpcodes.ReadReceipt) { + w_readreceipt.foreach(_ := true.B) + } + } + when (txdat.fire) { + s_ncbwrdata := true.B + } + when (io.pCrdGrant) { + w_pcrdgrant := true.B + } + + /** + * IO Assignment + */ + io.req.ready := no_schedule && no_wait + txreq.valid := !s_txreq && w_pcrdgrant + txreq.bits := 0.U.asTypeOf(txreq.bits.cloneType) + txreq.bits.tgtID := SAM(sam).lookup(txreq.bits.addr) + txreq.bits.txnID := io.id + txreq.bits.opcode := ParallelLookUp(req.opcode, Seq( + Get -> REQOpcodes.ReadNoSnp, + PutFullData -> REQOpcodes.WriteNoSnpFull, + PutPartialData -> REQOpcodes.WriteNoSnpPtl + )) + txreq.bits.size := req.size + txreq.bits.addr := req.address + txreq.bits.allowRetry := allowRetry + txreq.bits.order := order + txreq.bits.pCrdType := Mux(allowRetry, 0.U, pCrdType) + txreq.bits.memAttr := MemAttr(allocate = false.B, cacheable = false.B, device = true.B, ewa = false.B) + txreq.bits.expCompAck := false.B + + io.resp.valid := !s_resp && Mux(isRead, w_compdata, w_comp && w_dbidresp && s_ncbwrdata) + io.resp.bits.opcode := Mux(isRead, AccessAckData, AccessAck) + io.resp.bits.param := 0.U // reserved + io.resp.bits.size := req.size + io.resp.bits.source := req.source + io.resp.bits.sink := 0.U // ignored + io.resp.bits.denied := false.B + io.resp.bits.corrupt := false.B + io.resp.bits.data := ParallelLookUp( + reqWordIdx, + List.tabulate(words)(i => i.U -> rdata((i + 1) * wordBits - 1, i * wordBits)) + ) + + txdat.valid := !s_ncbwrdata && w_dbidresp + txdat.bits := 0.U.asTypeOf(txdat.bits.cloneType) + txdat.bits.tgtID := srcID + txdat.bits.txnID := dbID + txdat.bits.opcode := DATOpcodes.NonCopyBackWrData + txdat.bits.ccID := 0.U + txdat.bits.dataID := 0.U + txdat.bits.be := ParallelLookUp( + reqWordIdx, + List.tabulate(words)(i => i.U -> (ZeroExt(req.mask, BE_WIDTH) << (i * wordBytes))) + ) + txdat.bits.data := Fill(words, req.data) & FillInterleaved(8, txdat.bits.be) + + rxrsp.ready := (!w_comp || !w_dbidresp || !w_readreceipt.getOrElse(true.B)) && s_txreq + rxdat.ready := !w_compdata && s_txreq + + io.pCrdQuery.valid := !w_pcrdgrant + io.pCrdQuery.bits.pCrdType := pCrdType + + io.waitOnReadReceipt.foreach(_ := !w_readreceipt.get && (s_txreq || !allowRetry)) +} + +class MMIOBridgeImp(outer: MMIOBridge) extends LazyModuleImp(outer) + with HasCoupledL2Parameters + with HasCHIMsgParameters { + + val (bus, edge) = outer.mmioNode.in.head + + val io = IO(new DecoupledNoSnpPortIO) + + val entries = Seq.fill(mmioBridgeSize) { Module(new MMIOBridgeEntry(edge)) } + val readys = 
VecInit(entries.map(_.io.req.ready)) + val selectOH = ParallelPriorityMux(readys.zipWithIndex.map { case (ready, i) => + ready -> (1 << i).U + }).asBools + + /** + * When a ReadNoSnp requires RequestOrder or Endpoint Order, the requester requires a ReadReceipt to determine + * when it can send the next ordered request. + */ + val waitOnReadReceiptVec = entries.map(e => e.io.waitOnReadReceipt.getOrElse(false.B)) + val waitOnReadReceipt = Cat(waitOnReadReceiptVec).orR + + /** + * Protocol Retry + */ + val pCrdValids = RegInit(VecInit(Seq.fill(mmioBridgeSize)(false.B))) + val pCrdTypes = Reg(Vec(mmioBridgeSize, UInt(PCRDTYPE_WIDTH.W))) + val pCrdInsertOH = PriorityEncoderOH(pCrdValids.map(!_)) + val isPCrdGrant = io.rx.rsp.bits.opcode === RSPOpcodes.PCrdGrant + val pCrdMatch = Wire(Vec(mmioBridgeSize, Vec(mmioBridgeSize, Bool()))) + val pCrdMatchEntryVec = pCrdMatch.map(_.asUInt.orR) + val pCrdMatchEntryOH = PriorityEncoderOH(pCrdMatchEntryVec) + val pCrdFreeOH = ParallelPriorityMux( + pCrdMatchEntryVec, + pCrdMatch.map(x => VecInit(PriorityEncoderOH(x))) + ) + + when (io.rx.rsp.valid && isPCrdGrant) { + pCrdValids.zip(pCrdInsertOH).foreach { case (v, insert) => + when (insert) { v := true.B } + assert(!(v && insert), "P-Credit overflow") + } + pCrdTypes.zip(pCrdInsertOH).foreach { case (t, insert) => + when (insert) { t := io.rx.rsp.bits.pCrdType } + } + } + pCrdFreeOH.zip(pCrdValids).foreach { case (free, v) => + when (free) { v := false.B } + } + + entries.zipWithIndex.foreach { case (entry, i) => + entry.io.req.valid := bus.a.valid && selectOH(i) + entry.io.req.bits := bus.a.bits + + entry.io.chi.rx.dat.valid := io.rx.dat.valid && io.rx.dat.bits.txnID === i.U + entry.io.chi.rx.dat.bits := io.rx.dat.bits + + entry.io.chi.rx.rsp.valid := io.rx.rsp.valid && io.rx.rsp.bits.txnID === i.U + entry.io.chi.rx.rsp.bits := io.rx.rsp.bits + + entry.io.id := i.U + + pCrdMatch(i) := VecInit(pCrdValids.zip(pCrdTypes).map { case (v, t) => + entry.io.pCrdQuery.valid && v && + entry.io.pCrdQuery.bits.pCrdType === t + }) + entry.io.pCrdGrant := pCrdMatchEntryOH(i) + } + + val txreqArb = Module(new Arbiter(chiselTypeOf(io.tx.req.bits), mmioBridgeSize)) + for ((a, req) <- txreqArb.io.in.zip(entries.map(_.io.chi.tx.req))) { + a <> req + val isReadNoSnp = req.bits.opcode === REQOpcodes.ReadNoSnp + val block = isReadNoSnp && waitOnReadReceipt + req.ready := a.ready && !block + a.valid := req.valid && !block + } + io.tx.req <> txreqArb.io.out + // arb(entries.map(_.io.chi.tx.req), io.tx.req, Some("mmio_txreq")) + arb(entries.map(_.io.chi.tx.dat), io.tx.dat, Some("mmio_txdat")) + arb(entries.map(_.io.resp), bus.d, Some("mmio_channel_D")) + + bus.a.ready := Cat(readys).orR + + io.rx.dat.ready := Cat(entries.zipWithIndex.map { case (entry, i) => + entry.io.chi.rx.dat.ready && io.rx.dat.bits.txnID === i.U + }).orR + io.rx.rsp.ready := Cat(entries.zipWithIndex.map { case (entry, i) => + entry.io.chi.rx.rsp.ready && io.rx.rsp.bits.txnID === i.U + }).orR || isPCrdGrant + + dontTouch(io) + dontTouch(bus) +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/MSHR.scala b/src/main/scala/coupledL2/tl2chi/MSHR.scala new file mode 100644 index 00000000..21bc855b --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/MSHR.scala @@ -0,0 +1,1037 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed 
under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import coupledL2.MetaData._ +import utility.{MemReqSource, ParallelLookUp, ParallelPriorityMux} +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import freechips.rocketchip.tilelink.TLPermissions._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.prefetch.{PfSource, PrefetchTrain} +import coupledL2.utils.XSPerfAccumulate +import coupledL2.tl2chi.CHIOpcode._ +import coupledL2.tl2chi.CHIOpcode.DATOpcodes._ +import coupledL2.tl2chi.CHIOpcode.REQOpcodes._ +import coupledL2.tl2chi.CHIOpcode.RSPOpcodes._ +import coupledL2.tl2chi.CHIOpcode.SNPOpcodes._ +import coupledL2.tl2chi.CHICohStates._ +import coupledL2.tl2chi.CHIChannel +import coupledL2._ + + +class MSHRTasks(implicit p: Parameters) extends TL2CHIL2Bundle { + // outer + val txreq = DecoupledIO(new CHIREQ) //TODO: no need to use decoupled handshake + val txrsp = DecoupledIO(new CHIRSP) //TODO: no need to use decoupled handshake + val source_b = DecoupledIO(new SourceBReq) + val mainpipe = DecoupledIO(new TaskBundle) // To Mainpipe (SourceC or SourceD) + // val prefetchTrain = prefetchOpt.map(_ => DecoupledIO(new PrefetchTrain)) // To prefetcher +} + +class MSHRResps(implicit p: Parameters) extends TL2CHIL2Bundle { + val sinkC = Flipped(ValidIO(new RespInfoBundle)) + val rxrsp = Flipped(ValidIO(new RespInfoBundle)) + val rxdat = Flipped(ValidIO(new RespInfoBundle)) +// val rxrsp = new RespBundle() +// val rxdat = new RespBundle() +} + +class MSHR(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val id = Input(UInt(mshrBits.W)) + val status = ValidIO(new MSHRStatus) + val msInfo = ValidIO(new MSHRInfo) + val alloc = Flipped(ValidIO(new MSHRRequest)) + val tasks = new MSHRTasks() + val resps = new MSHRResps() + val nestedwb = Input(new NestedWriteback) + val nestedwbData = Output(Bool()) + val aMergeTask = Flipped(ValidIO(new TaskBundle)) + val replResp = Flipped(ValidIO(new ReplacerResult)) + val pCamPri = Input(Bool()) + val waitPCrdInfo = Output(new PCrdInfo) + }) + + require (chiOpt.isDefined) + + val gotT = RegInit(false.B) // L3 might return T even though L2 wants B + val gotDirty = RegInit(false.B) + val gotGrantData = RegInit(false.B) + val probeDirty = RegInit(false.B) + val probeGotN = RegInit(false.B) + val timer = RegInit(0.U(64.W)) // for performance analysis + + val req_valid = RegInit(false.B) + val req = RegInit(0.U.asTypeOf(new TaskBundle())) + val dirResult = RegInit(0.U.asTypeOf(new DirResult())) + val meta = dirResult.meta + val initState = Wire(new FSMState()) + initState.elements.foreach(_._2 := true.B) + val state = RegInit(new FSMState(), initState) + + /** + * When all the ways are occupied by some MSHR, other MSHRs with the same set may retry to find a way to replace + * over and over again, which may block the entrance of main pipe and lead to potential deadlock.
To resolve the + * problem, we allow the MSHR to retry immediately up to 3 times (backoffThreshold). If it still fails to find a way, the + * MSHR must back off for a period of time (backoffCycles) to yield the opportunity to access main pipe. + */ + val backoffThreshold = 3 + val backoffCycles = 20 + val retryTimes = RegInit(0.U(log2Up(backoffThreshold).W)) + val backoffTimer = RegInit(0.U(log2Up(backoffCycles).W)) + + //for CHI + val srcid = RegInit(0.U(NODEID_WIDTH.W)) + val homenid = RegInit(0.U(NODEID_WIDTH.W)) + val dbid = RegInit(0.U(DBID_WIDTH.W)) + val pcrdtype = RegInit(0.U(PCRDTYPE_WIDTH.W)) + val gotRetryAck = RegInit(false.B) + val gotPCrdGrant = RegInit(false.B) + val gotReissued = RegInit(false.B) + val metaChi = ParallelLookUp( + Cat(meta.dirty, meta.state), + Seq( + Cat(false.B, INVALID)-> I, + Cat(false.B, BRANCH) -> SC, + Cat(false.B, TRUNK) -> UC, + Cat(false.B, TIP) -> UC, + Cat( true.B, TRUNK) -> UD, + Cat( true.B, TIP) -> UD + )) + //for PCrdGrant info. search + io.waitPCrdInfo.valid := gotRetryAck && !gotReissued + io.waitPCrdInfo.srcID.get := srcid + io.waitPCrdInfo.pCrdType.get := pcrdtype + + /* Allocation */ + when(io.alloc.valid) { + req_valid := true.B + state := io.alloc.bits.state + dirResult := io.alloc.bits.dirResult + req := io.alloc.bits.task + gotT := false.B + gotDirty := false.B + gotGrantData := false.B + probeDirty := false.B + probeGotN := false.B + timer := 1.U + + gotRetryAck := false.B + gotPCrdGrant := false.B + gotReissued := false.B + srcid := 0.U + dbid := 0.U + pcrdtype := 0.U + + retryTimes := 0.U + backoffTimer := 0.U + } + + /* ======== Enhancement ======== */ + val meta_pft = meta.prefetch.getOrElse(false.B) + val meta_no_client = !meta.clients.orR + + val req_needT = needT(req.opcode, req.param) + val req_needB = needB(req.opcode, req.param) + val req_acquire = req.opcode === AcquireBlock && req.fromA || req.opcode === AcquirePerm // AcquireBlock and Probe share the same opcode + val req_acquirePerm = req.opcode === AcquirePerm + val req_get = req.opcode === Get + val req_prefetch = req.opcode === Hint + + val req_chiOpcode = req.chiOpcode.get + + val snpToN = isSnpToN(req_chiOpcode) + val snpToB = isSnpToB(req_chiOpcode) + + /** + * About which snoop should echo SnpRespData[Fwded] instead of SnpResp[Fwded]: + * 1. When the snooped block is dirty, always echo SnpRespData[Fwded], except for SnpMakeInvalid*, SnpStash*, + * SnpOnceFwd, and SnpUniqueFwd. + * 2. When the snoop opcode is SnpCleanFwd, SnpNotSharedDirtyFwd or SnpSharedFwd, always echo SnpRespDataFwded + * if RetToSrc = 1 as long as the snooped block is valid. + * 3. When the snoop opcode is a non-forwarding, non-stashing snoop, echo SnpRespData if RetToSrc = 1 as long as the + * cache line is Shared Clean and the snoopee retains a copy of the cache line.
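+ * As an illustrative reading of rule 3 against the terms below: SnpShared with RetToSrc = 1 that hits a clean BRANCH (SC) copy answers SnpRespData, while the same snoop hitting a clean UC copy answers plain SnpResp, because doRespData_retToSrc_nonFwd only fires when meta.state === BRANCH.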
+ */ + val doRespData_dirty = (isT(meta.state) && meta.dirty || probeDirty) && ( + req_chiOpcode === SnpOnce || + snpToB || + req_chiOpcode === SnpUnique || + req_chiOpcode === SnpUniqueStash || + req_chiOpcode === SnpCleanShared || + req_chiOpcode === SnpCleanInvalid + ) + val doRespData_retToSrc_fwd = req.retToSrc.get && (isSnpToBFwd(req_chiOpcode) || isSnpToNFwd(req_chiOpcode)) + val doRespData_retToSrc_nonFwd = req.retToSrc.get && meta.state === BRANCH && (isSnpToBNonFwd(req_chiOpcode) || isSnpToNNonFwd(req_chiOpcode)) + val doRespData = Mux( + dirResult.hit, + doRespData_dirty || doRespData_retToSrc_fwd || doRespData_retToSrc_nonFwd, + req.snpHitRelease && req.snpHitReleaseWithData + ) + dontTouch(doRespData_dirty) + dontTouch(doRespData_retToSrc_fwd) + dontTouch(doRespData_retToSrc_nonFwd) + + /** + * About which snoop should echo SnpResp[Data]Fwded instead of SnpResp[Data]: + * 1. When the snoop opcode is Snp*Fwd and the snooped block is valid. + */ + val doFwd = isSnpXFwd(req_chiOpcode) && dirResult.hit + val doFwdHitRelease = isSnpXFwd(req_chiOpcode) && req.snpHitRelease && req.snpHitReleaseWithData + + val gotUD = meta.dirty & isT(meta.state) //TC/TTC -> UD + val promoteT_normal = dirResult.hit && meta_no_client && meta.state === TIP + val promoteT_L3 = !dirResult.hit && gotT + val promoteT_alias = dirResult.hit && req.aliasTask.getOrElse(false.B) && (meta.state === TRUNK || meta.state === TIP) + // under above circumstances, we grant T to L1 even if it wants B + val req_promoteT = (req_acquire || req_get || req_prefetch) && (promoteT_normal || promoteT_L3 || promoteT_alias) + + assert(!(req_valid && req_prefetch && dirResult.hit), "MSHR can not receive prefetch hit req") + + /* ======== Task allocation ======== */ + // Theoretically, data to be released is saved in ReleaseBuffer, so Acquire can be sent as soon as req enters mshr +// io.tasks.txreq.valid := !state.s_acquire || !state.s_reissue + io.tasks.txreq.valid := !state.s_acquire || + !state.s_reissue.getOrElse(false.B) && !state.w_grant && gotRetryAck && gotPCrdGrant + io.tasks.txrsp.valid := !state.s_compack.get && state.w_grantlast + io.tasks.source_b.valid := !state.s_pprobe || !state.s_rprobe + val mp_release_valid = !state.s_release && state.w_rprobeacklast && state.w_grantlast && state.w_replResp || + !state.s_reissue.getOrElse(false.B) && !state.w_releaseack && gotRetryAck && gotPCrdGrant + // release after Grant to L1 sent and replRead returns + val mp_cbwrdata_valid = !state.s_cbwrdata.getOrElse(true.B) && state.w_releaseack + val mp_probeack_valid = !state.s_probeack && state.w_pprobeacklast + val pending_grant_valid = !state.s_refill && state.w_grantlast && state.w_rprobeacklast + val mp_grant_valid = pending_grant_valid && (retryTimes < backoffThreshold.U || backoffTimer === backoffCycles.U) + val mp_dct_valid = !state.s_dct.getOrElse(true.B) && state.s_probeack + io.tasks.mainpipe.valid := + mp_release_valid || + mp_probeack_valid || + mp_grant_valid || + mp_cbwrdata_valid || + mp_dct_valid + // io.tasks.prefetchTrain.foreach(t => t.valid := !state.s_triggerprefetch.getOrElse(true.B)) + + when ( + pending_grant_valid && + backoffTimer < backoffCycles.U && + retryTimes === backoffThreshold.U + ) { + backoffTimer := backoffTimer + 1.U + } + + // resp and fwdState + val respCacheState = ParallelPriorityMux(Seq( + snpToN -> I, + snpToB -> SC, + (isSnpOnceX(req_chiOpcode) || isSnpStashX(req_chiOpcode)) -> + Mux(probeDirty || meta.dirty, UD, metaChi), + isSnpCleanShared(req_chiOpcode) -> Mux(isT(meta.state), UC, 
metaChi) + )) + val respPassDirty = (meta.dirty || probeDirty) && ( + snpToB || + req_chiOpcode === SnpUnique || + req_chiOpcode === SnpUniqueStash || + req_chiOpcode === SnpCleanShared || + req_chiOpcode === SnpCleanInvalid + ) + val fwdCacheState = Mux( + isSnpToBFwd(req_chiOpcode), + SC, + Mux( + req_chiOpcode === SnpUniqueFwd, + Mux(meta.dirty || probeDirty, UD, UC), + I + ) + ) + val fwdPassDirty = req_chiOpcode === SnpUniqueFwd && (meta.dirty || probeDirty) + + /*TXRSP for CompAck */ + val txrsp_task = { + val orsp = io.tasks.txrsp.bits + orsp := 0.U.asTypeOf(io.tasks.txrsp.bits.cloneType) + orsp.tgtID := Mux(req.opcode === AcquirePerm && req.param === NtoT, srcid, homenid) + orsp.srcID := 0.U + orsp.txnID := dbid + orsp.dbID := 0.U + orsp.opcode := CompAck +// orsp.resperr := 0.U + orsp.resp := 0.U + orsp.fwdState := 0.U +// orsp.stashhit := 0.U +// orsp.datapull :=0.U +// orsp.pcrdtype := 0.U +// orsp.tracetag := 0.U + } + + /*TXREQ for Transaction Request*/ + val a_task = { + val oa = io.tasks.txreq.bits + oa := 0.U.asTypeOf(io.tasks.txreq.bits.cloneType) +// oa.qos := Mux(!state.s_reissue, 3.U, 0.U) //TODO increase qos when retry + oa.tgtID := Mux(!state.s_reissue.getOrElse(false.B), srcid, 0.U) + oa.srcID := 0.U + oa.txnID := io.id + oa.returnNID := 0.U + oa.stashNID := 0.U + oa.stashNIDValid := false.B + /** + * TL CHI + * -------------------------------------------- + * Get | ReadNotSharedDirty + * AcquireBlock NtoB | ReadNotSharedDirty + * AcquireBlock NtoT | ReadUnique + * AcquirePerm NtoT | MakeUnique + * AcquirePerm BtoT | ReadUnique + * PrefetchRead | ReadNotSharedDirty + * PrefetchWrite | ReadUnique + */ + oa.opcode := ParallelPriorityMux(Seq( + (req.opcode === AcquirePerm && req.param === NtoT) -> MakeUnique, + req_needT -> ReadUnique, + req_needB /* Default */ -> ReadNotSharedDirty + )) + oa.size := log2Ceil(blockBytes).U + oa.addr := Cat(req.tag, req.set, 0.U(offsetBits.W)) //TODO 36bit -> 48bit + oa.ns := false.B + oa.likelyshared := false.B + oa.allowRetry := state.s_reissue.getOrElse(false.B) + oa.order := OrderEncodings.None + oa.pCrdType := Mux(!state.s_reissue.getOrElse(false.B), pcrdtype, 0.U) + oa.expCompAck := true.B + oa.memAttr := MemAttr(cacheable = true.B, allocate = true.B, device = false.B, ewa = true.B) + oa.snpAttr := true.B + oa.lpID := 0.U + oa.excl := false.B + oa.snoopMe := false.B + oa.traceTag := false.B + oa + } + + val b_task = { + val ob = io.tasks.source_b.bits + ob.tag := dirResult.tag + ob.set := dirResult.set + ob.off := 0.U + ob.opcode := Probe + ob.param := Mux( + !state.s_pprobe, + Mux( + snpToB, + toB, + Mux(snpToN, toN, toT) + ), + Mux( + req_get && dirResult.hit && meta.state === TRUNK, + toB, + toN + ) + ) + ob.alias.foreach(_ := meta.alias.getOrElse(0.U)) + ob + } + + val mp_release, mp_probeack, mp_grant, mp_cbwrdata, mp_dct = WireInit(0.U.asTypeOf(new TaskBundle)) + val mp_release_task = { + mp_release.channel := req.channel + mp_release.txChannel := CHIChannel.TXREQ + mp_release.tag := dirResult.tag + mp_release.set := req.set + mp_release.off := 0.U + mp_release.alias.foreach(_ := 0.U) + mp_release.vaddr.foreach(_ := 0.U) + mp_release.isKeyword.foreach(_ := false.B) + // if dirty, we must ReleaseData + // if accessed, we ReleaseData to keep the data in L3, for future access to be faster + // [Access] TODO: consider use a counter + mp_release.opcode := 0.U // use chiOpcode + mp_release.param := Mux(isT(meta.state), TtoN, BtoN) + mp_release.size := log2Ceil(blockBytes).U + mp_release.sourceId := 0.U(sourceIdBits.W) + 
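+ // Note: mp_release is steered to TXREQ and becomes WriteBackFull when the line is dirty (or a probe returned dirty data) and Evict otherwise (see the chiOpcode Mux below); the data itself follows later as the separate mp_cbwrdata task on TXDAT once CompDBIDResp arrives.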
mp_release.bufIdx := 0.U(bufIdxBits.W) + mp_release.needProbeAckData := false.B + mp_release.mshrTask := true.B + mp_release.mshrId := io.id + mp_release.aliasTask.foreach(_ := false.B) + // mp_release definitely read releaseBuf and refillBuf at ReqArb + // and it needs to write refillData to DS, so useProbeData is set false according to DS.wdata logic + mp_release.useProbeData := false.B + mp_release.mshrRetry := false.B + mp_release.way := dirResult.way + mp_release.fromL2pft.foreach(_ := false.B) + mp_release.needHint.foreach(_ := false.B) + mp_release.dirty := false.B//meta.dirty && meta.state =/= INVALID || probeDirty + mp_release.metaWen := false.B + mp_release.meta := MetaEntry() + mp_release.tagWen := false.B + mp_release.dsWen := true.B // write refillData to DS + mp_release.replTask := true.B + mp_release.wayMask := 0.U(cacheParams.ways.W) + mp_release.reqSource := 0.U(MemReqSource.reqSourceBits.W) + mp_release.mergeA := false.B + mp_release.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + + // CHI + val isWriteBackFull = isT(meta.state) && meta.dirty || probeDirty + mp_release.tgtID.get := 0.U + mp_release.srcID.get := 0.U + mp_release.txnID.get := io.id + mp_release.homeNID.get := 0.U + mp_release.dbID.get := 0.U + mp_release.chiOpcode.get := Mux(isWriteBackFull, WriteBackFull, Evict) + mp_release.resp.get := 0.U // DontCare + mp_release.fwdState.get := 0.U // DontCare + mp_release.pCrdType.get := 0.U // DontCare // TODO: consider retry of WriteBackFull/Evict + mp_release.retToSrc.get := req.retToSrc.get + mp_release.expCompAck.get := false.B + mp_release.allowRetry.get := state.s_reissue.getOrElse(false.B) + mp_release.memAttr.get := MemAttr(allocate = isWriteBackFull, cacheable = true.B, device = false.B, ewa = true.B) + mp_release + } + + val mp_cbwrdata_task = { + mp_cbwrdata.channel := req.channel + mp_cbwrdata.txChannel := CHIChannel.TXDAT + mp_cbwrdata.tag := dirResult.tag + mp_cbwrdata.set := req.set + mp_cbwrdata.off := 0.U + mp_cbwrdata.alias.foreach(_ := 0.U) + mp_cbwrdata.vaddr.foreach(_ := 0.U) + mp_cbwrdata.isKeyword.foreach(_ := false.B) + mp_cbwrdata.opcode := 0.U + mp_cbwrdata.param := 0.U + mp_cbwrdata.size := log2Ceil(blockBytes).U + mp_cbwrdata.sourceId := 0.U(sourceIdBits.W) + mp_cbwrdata.bufIdx := 0.U(bufIdxBits.W) + mp_cbwrdata.needProbeAckData := false.B + mp_cbwrdata.mshrTask := true.B + mp_cbwrdata.mshrId := io.id + mp_cbwrdata.aliasTask.foreach(_ := false.B) + mp_cbwrdata.useProbeData := false.B // DontCare + mp_cbwrdata.mshrRetry := false.B + mp_cbwrdata.way := dirResult.way + mp_cbwrdata.fromL2pft.foreach(_ := false.B) + mp_cbwrdata.needHint.foreach(_ := false.B) + mp_cbwrdata.dirty := false.B // DontCare + mp_cbwrdata.metaWen := false.B + mp_cbwrdata.meta := MetaEntry() + mp_cbwrdata.tagWen := false.B + mp_cbwrdata.dsWen := false.B + mp_cbwrdata.replTask := false.B + mp_cbwrdata.wayMask := 0.U + mp_cbwrdata.reqSource := 0.U + mp_cbwrdata.mergeA := false.B + mp_cbwrdata.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + + // CHI + mp_cbwrdata.tgtID.get := srcid + mp_cbwrdata.srcID.get := 0.U + mp_cbwrdata.txnID.get := dbid + mp_cbwrdata.homeNID.get := 0.U + mp_cbwrdata.dbID.get := 0.U + mp_cbwrdata.chiOpcode.get := CopyBackWrData + mp_cbwrdata.resp.get := Mux(isValid(meta.state), UD_PD, I) + mp_cbwrdata.fwdState.get := 0.U + mp_cbwrdata.pCrdType.get := 0.U // TODO + mp_cbwrdata.retToSrc.get := req.retToSrc.get // DontCare + mp_cbwrdata.expCompAck.get := false.B + mp_cbwrdata + } + + val mp_probeack_task = { + mp_probeack.channel := req.channel + 
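+ // Note: snoop responses default to TXRSP; when doRespData requires returning data, the Mux below steers the task to TXDAT so it is emitted as SnpRespData*.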
mp_probeack.txChannel := Mux(doRespData, CHIChannel.TXDAT, CHIChannel.TXRSP) + mp_probeack.tag := req.tag + mp_probeack.set := req.set + mp_probeack.off := req.off + mp_probeack.alias.foreach(_ := 0.U) + mp_probeack.vaddr.foreach(_ := 0.U) + mp_probeack.isKeyword.foreach(_ := false.B) + mp_probeack.opcode := 0.U /* Mux( + meta.dirty && isT(meta.state) || probeDirty || req.needProbeAckData, + ProbeAckData, + ProbeAck + ) */ // DontCare + mp_probeack.param := DontCare + mp_probeack.size := log2Ceil(blockBytes).U + mp_probeack.sourceId := 0.U(sourceIdBits.W) + mp_probeack.bufIdx := 0.U(bufIdxBits.W) + mp_probeack.needProbeAckData := false.B + mp_probeack.mshrTask := true.B + mp_probeack.mshrId := io.id + mp_probeack.aliasTask.foreach(_ := false.B) + mp_probeack.useProbeData := true.B // write [probeAckData] to DS, if not probed toN // ??? + mp_probeack.mshrRetry := false.B + mp_probeack.way := dirResult.way + mp_probeack.fromL2pft.foreach(_ := false.B) + mp_probeack.needHint.foreach(_ := false.B) + mp_probeack.dirty := meta.dirty && meta.state =/= INVALID || probeDirty + mp_probeack.meta := MetaEntry( + /** + * Under what circumstances should the dirty bit be cleared: + * 1. If the snoop belongs to SnpToN + * 2. If the snoop belongs to SnpToB + * 3. If the snoop is SnpCleanShared + * Otherwise, the dirty bit should stay the same as before. + */ + dirty = !snpToN && !snpToB && req_chiOpcode =/= SnpCleanShared && meta.dirty, + state = Mux( + snpToN, + INVALID, + Mux(snpToB, BRANCH, meta.state) + ), + clients = Fill(clientBits, !probeGotN && !snpToN), + alias = meta.alias, //[Alias] Keep alias bits unchanged + prefetch = !snpToN && meta_pft, + accessed = !snpToN && meta.accessed + ) + mp_probeack.metaWen := !req.snpHitRelease + mp_probeack.tagWen := false.B + mp_probeack.dsWen := !snpToN && probeDirty + mp_probeack.wayMask := 0.U(cacheParams.ways.W) + mp_probeack.reqSource := 0.U(MemReqSource.reqSourceBits.W) + mp_probeack.replTask := false.B + mp_probeack.mergeA := false.B + mp_probeack.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + + // CHI + mp_probeack.tgtID.get := req.srcID.get + mp_probeack.srcID.get := 0.U + mp_probeack.txnID.get := req.txnID.get + mp_probeack.homeNID.get := 0.U + // For SnpRespData or SnpRespDataPtl, DBID is set to the same value as the TxnID of the snoop. + // For SnpRespDataFwded or SnpRespDataPtlFwded, DBID is not defined and can be any value.
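+ // Note: the MuxLookup below keys on Cat(fwd, data): 00 -> SnpResp, 01 -> SnpRespData, 10 -> SnpRespFwded, 11 -> SnpRespDataFwded.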
+ mp_probeack.dbID.get := req.txnID.getOrElse(0.U) + mp_probeack.chiOpcode.get := MuxLookup( + Cat(doFwd || doFwdHitRelease, doRespData), + SnpResp + )(Seq( + Cat(false.B, false.B) -> SnpResp, + Cat(true.B, false.B) -> SnpRespFwded, + Cat(false.B, true.B) -> SnpRespData, // ignore SnpRespDataPtl for now + Cat(true.B, true.B) -> SnpRespDataFwded + )) + mp_probeack.resp.get := Mux( + req.snpHitRelease && req.snpHitReleaseWithData, + I_PD, + setPD(respCacheState, respPassDirty && doRespData) + ) + mp_probeack.fwdState.get := setPD(fwdCacheState, fwdPassDirty) + mp_probeack.pCrdType.get := 0.U + mp_probeack.retToSrc.get := req.retToSrc.get // DontCare + mp_probeack.expCompAck.get := false.B + mp_probeack.snpHitRelease := req.snpHitRelease + mp_probeack.snpHitReleaseWithData := req.snpHitReleaseWithData + mp_probeack.snpHitReleaseIdx := req.snpHitReleaseIdx + + mp_probeack + } + + + val mergeA = RegInit(false.B) + when(io.aMergeTask.valid) { + mergeA := true.B + }.elsewhen(io.alloc.valid) { + mergeA := false.B + } + val mp_grant_task = { + mp_grant.channel := req.channel + mp_grant.tag := req.tag + mp_grant.set := req.set + mp_grant.off := req.off + mp_grant.sourceId := req.sourceId + mp_grant.alias.foreach(_ := 0.U) + mp_grant.vaddr.foreach(_ := 0.U) + mp_grant.isKeyword.foreach(_ := req.isKeyword.getOrElse(false.B)) + mp_grant.opcode := odOpGen(req.opcode) + mp_grant.param := Mux( + req_get || req_prefetch, + 0.U, // Get -> AccessAckData + MuxLookup( // Acquire -> Grant + req.param, + req.param)( + Seq( + NtoB -> Mux(req_promoteT, toT, toB), + BtoT -> toT, + NtoT -> toT + ) + ) + ) + mp_grant.size := 0.U(msgSizeBits.W) + mp_grant.bufIdx := 0.U(bufIdxBits.W) + mp_grant.needProbeAckData := false.B + mp_grant.mshrTask := true.B + mp_grant.mshrId := io.id + mp_grant.way := dirResult.way + // if it is a Get or Prefetch, then we must keep alias bits unchanged + // in case future probes gets the wrong alias bits + val aliasFinal = Mux(req_get || req_prefetch, meta.alias.getOrElse(0.U), req.alias.getOrElse(0.U)) + mp_grant.alias.foreach(_ := aliasFinal) + mp_grant.aliasTask.foreach(_ := req.aliasTask.getOrElse(false.B)) + // [Alias] write probeData into DS for alias-caused Probe, but not replacement-caused Probe + // Exception case when multi-core: if aliasTask is AcquireBlock NtoT and self_state is Branch, + // and there is a nested Probe toN from L3 (means the data Granted from L3 in the future may be a new data), + // useProbeData will be set false to use data in RefillBuffer + mp_grant.useProbeData := (dirResult.hit && req_get) || + (req.aliasTask.getOrElse(false.B) && + !(dirResult.meta.state === BRANCH && req_needT) + ) + mp_grant.dirty := false.B + + mp_grant.meta := MetaEntry( + dirty = gotDirty || dirResult.hit && (meta.dirty || probeDirty), + state = Mux( + req_get, + Mux( // Get + dirResult.hit, + Mux(isT(meta.state), TIP, BRANCH), + Mux(req_promoteT, TIP, BRANCH) + ), + Mux( // Acquire + req_promoteT || req_needT, + Mux(req_prefetch, TIP, TRUNK), + BRANCH + ) + ), + clients = Mux( + req_prefetch, + Mux(dirResult.hit, meta.clients, Fill(clientBits, false.B)), + Fill(clientBits, !(req_get && (!dirResult.hit || meta_no_client || probeGotN))) + ), + alias = Some(aliasFinal), + prefetch = req_prefetch || dirResult.hit && meta_pft, + pfsrc = PfSource.fromMemReqSource(req.reqSource), + accessed = req_acquire || req_get + ) + mp_grant.metaWen := true.B + mp_grant.tagWen := !dirResult.hit + mp_grant.dsWen := gotGrantData || probeDirty && (req_get || req.aliasTask.getOrElse(false.B)) + 
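+ // Note: for this refill task the directory is always updated (metaWen above), the tag is written only on a miss (tagWen := !dirResult.hit), and DS is written only when grant data or dirty probe data must land in it (dsWen).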
mp_grant.fromL2pft.foreach(_ := req.fromL2pft.get) + mp_grant.needHint.foreach(_ := false.B) + mp_grant.replTask := !dirResult.hit && !state.w_replResp + mp_grant.wayMask := 0.U(cacheParams.ways.W) + mp_grant.mshrRetry := !state.s_retry + mp_grant.reqSource := 0.U(MemReqSource.reqSourceBits.W) + + // Add merge grant task for Acquire and late Prefetch + mp_grant.mergeA := mergeA || io.aMergeTask.valid + val merge_task_r = RegEnable(io.aMergeTask.bits, 0.U.asTypeOf(new TaskBundle), io.aMergeTask.valid) + val merge_task = Mux(io.aMergeTask.valid, io.aMergeTask.bits, merge_task_r) + val merge_task_isKeyword = Mux(io.aMergeTask.valid, io.aMergeTask.bits.isKeyword.getOrElse(false.B), merge_task_r.isKeyword.getOrElse(false.B) ) + + mp_grant.aMergeTask.off := merge_task.off + mp_grant.aMergeTask.alias.foreach(_ := merge_task.alias.getOrElse(0.U)) + mp_grant.aMergeTask.vaddr.foreach(_ := merge_task.vaddr.getOrElse(0.U)) + mp_grant.aMergeTask.isKeyword.foreach(_ := merge_task_isKeyword) + mp_grant.aMergeTask.opcode := odOpGen(merge_task.opcode) + mp_grant.aMergeTask.param := MuxLookup( // Acquire -> Grant + merge_task.param, + merge_task.param)( + Seq( + NtoB -> Mux(req_promoteT, toT, toB), + BtoT -> toT, + NtoT -> toT + ) + ) + mp_grant.aMergeTask.sourceId := merge_task.sourceId + mp_grant.aMergeTask.meta := MetaEntry( + dirty = gotDirty || dirResult.hit && (meta.dirty || probeDirty), + state = Mux( // Acquire + req_promoteT || needT(merge_task.opcode, merge_task.param), + TRUNK, + BRANCH + ), + clients = Fill(clientBits, true.B), + alias = Some(merge_task.alias.getOrElse(0.U)), + prefetch = false.B, + accessed = true.B + ) + + mp_grant + } + + val mp_dct_task = { + mp_dct.channel := req.channel + mp_dct.txChannel := CHIChannel.TXDAT + mp_dct.tag := req.tag + mp_dct.set := req.set + mp_dct.off := req.off + mp_dct.alias.foreach(_ := 0.U) + mp_dct.vaddr.foreach(_ := 0.U) + mp_dct.isKeyword.foreach(_ := 0.U) + mp_dct.opcode := 0.U // DontCare + mp_dct.param := 0.U // DontCare + mp_dct.size := log2Ceil(blockBytes).U + mp_dct.sourceId := 0.U(sourceIdBits.W) + mp_dct.bufIdx := 0.U(sourceIdBits.W) + mp_dct.needProbeAckData := false.B + mp_dct.mshrTask := true.B + mp_dct.mshrId := io.id + mp_dct.aliasTask.foreach(_ := false.B) + mp_dct.useProbeData := true.B + mp_dct.mshrRetry := false.B + mp_dct.way := dirResult.way + mp_dct.fromL2pft.foreach(_ := false.B) + mp_dct.needHint.foreach(_ := false.B) + mp_dct.dirty := meta.dirty && meta.state =/= INVALID || probeDirty + mp_dct.meta := MetaEntry() + mp_dct.metaWen := false.B // meta is written by SnpResp[Data]Fwded, not CompData + mp_dct.tagWen := false.B + mp_dct.dsWen := false.B + mp_dct.wayMask := 0.U(cacheParams.ways.W) + mp_dct.reqSource := 0.U(MemReqSource.reqSourceBits.W) + mp_dct.replTask := false.B + mp_dct.mergeA := false.B + mp_dct.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + + // CHI + mp_dct.tgtID.get := req.fwdNID.get + mp_dct.srcID.get := 0.U + mp_dct.txnID.get := req.fwdTxnID.get + mp_dct.homeNID.get := req.srcID.get + mp_dct.chiOpcode.get := CompData + mp_dct.resp.get := setPD(fwdCacheState, fwdPassDirty) + mp_dct.fwdState.get := 0.U + mp_dct.pCrdType.get := 0.U // DontCare + mp_dct.retToSrc.get := false.B // DontCare + mp_dct.expCompAck.get := false.B // DontCare + mp_dct.snpHitRelease := req.snpHitRelease + mp_dct.snpHitReleaseWithData := req.snpHitReleaseWithData + mp_dct.snpHitReleaseIdx := req.snpHitReleaseIdx + + mp_dct + } + io.tasks.mainpipe.bits := ParallelPriorityMux( + Seq( + mp_grant_valid -> mp_grant, + mp_release_valid -> 
mp_release, + mp_cbwrdata_valid -> mp_cbwrdata, + mp_probeack_valid -> mp_probeack, + mp_dct_valid -> mp_dct + ) + ) + io.tasks.mainpipe.bits.reqSource := req.reqSource + io.tasks.mainpipe.bits.isKeyword.foreach(_:= req.isKeyword.getOrElse(false.B)) + // io.tasks.prefetchTrain.foreach { + // train => + // train.bits.tag := req.tag + // train.bits.set := req.set + // train.bits.needT := req_needT + // train.bits.source := req.source + // } + + /* ======== Task update ======== */ + when (io.tasks.txreq.fire) { + state.s_acquire := true.B + state.s_reissue.get := true.B + } + when (io.tasks.txrsp.fire) { + state.s_compack.get := true.B + } + when (io.tasks.source_b.fire) { + state.s_pprobe := true.B + state.s_rprobe := true.B + } + when (io.tasks.mainpipe.ready) { + when (mp_grant_valid) { + state.s_refill := true.B + state.s_retry := true.B + }.elsewhen (mp_release_valid) { + state.s_release := true.B + state.s_reissue.get := true.B + state.s_cbwrdata.get := !(isT(meta.state) && meta.dirty || probeDirty) + // meta.state := INVALID + }.elsewhen (mp_cbwrdata_valid) { + state.s_cbwrdata.get := true.B + meta.state := INVALID + }.elsewhen (mp_probeack_valid) { + state.s_probeack := true.B + }.elsewhen (mp_dct_valid) { + state.s_dct.get := true.B + } + } + + /* Handling response + + TL CHI CHI Resp CHI channel + ----------------------------------------------------------------------------- + AcquireBlock | ReadNotSharedDirty | CompData | rxdat + AcquirePerm(miss) | ReadUnique | CompData | rxdat + AcquirePerm(hit B) | MakeUnique | Comp | rxrsp <- TODO + Get | ReadNotSharedDirty | CompData | rxdat + Hint | ReadNotSharedDirty | CompData | rxdat + Release | WriteBackFull | CompDBIDResp | rxrsp + | * | RetryAck+PCrdGrant | rxrsp <- + */ + val c_resp = io.resps.sinkC + val rxrsp = io.resps.rxrsp + val rxdat = io.resps.rxdat + // Probe core response + when (c_resp.valid) { + when (c_resp.bits.opcode === ProbeAck || c_resp.bits.opcode === ProbeAckData) { + state.w_rprobeackfirst := true.B + state.w_rprobeacklast := state.w_rprobeacklast || c_resp.bits.last + state.w_pprobeackfirst := true.B + state.w_pprobeacklast := state.w_pprobeacklast || c_resp.bits.last + state.w_pprobeack := state.w_pprobeack || req.off === 0.U || c_resp.bits.last + } + when (c_resp.bits.opcode === ProbeAckData) { + probeDirty := true.B + } + when (isToN(c_resp.bits.param)) { + probeGotN := true.B + } + } + + val rxdatIsU = rxdat.bits.resp.get === UC + val rxdatIsU_PD = rxdat.bits.resp.get === UC_PD + + val rxrspIsU = rxrsp.bits.resp.get === UC + + // RXDAT + when (rxdat.valid) { + when (rxdat.bits.chiOpcode.get === CompData) { + require(beatSize == 2) // TODO: This is ugly + state.w_grantfirst := true.B + state.w_grantlast := state.w_grantfirst + state.w_grant := req.off === 0.U || state.w_grantfirst // TODO? why offset? + gotT := rxdatIsU || rxdatIsU_PD + gotDirty := gotDirty || rxdatIsU_PD + gotGrantData := true.B + dbid := rxdat.bits.dbID.getOrElse(0.U) + homenid := rxdat.bits.homeNID.getOrElse(0.U) + } + } + + // RXRSP for dataless + when (rxrsp.valid) { + when (rxrsp.bits.chiOpcode.get === Comp) { + // There is a pending Read transaction waiting for the Comp resp + when (!state.w_grant) { + state.w_grantfirst := true.B + state.w_grantlast := rxrsp.bits.last + state.w_grant := req.off === 0.U || rxrsp.bits.last // TODO? why offset?
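+ // Note: reaching here with w_grant still pending presumably corresponds to the MakeUnique row of the table above (AcquirePerm hitting B): ownership arrives via Comp without data, so only the UC resp state is sampled below.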
gotT := rxrspIsU + gotDirty := false.B + } + + // There is a pending Evict transaction waiting for the Comp resp + when (!state.w_releaseack) { + state.w_releaseack := true.B + } + + // Comp for Dataless transactions that include CompAck + // Use DBID as an identifier for CompAck + dbid := rxrsp.bits.dbID.getOrElse(0.U) + srcid := rxrsp.bits.srcID.getOrElse(0.U) + } + when(rxrsp.bits.chiOpcode.get === CompDBIDResp) { + state.w_releaseack := true.B + srcid := rxrsp.bits.srcID.getOrElse(0.U) + dbid := rxrsp.bits.dbID.getOrElse(0.U) + } + when(rxrsp.bits.chiOpcode.get === RetryAck) { + srcid := rxrsp.bits.srcID.getOrElse(0.U) + pcrdtype := rxrsp.bits.pCrdType.getOrElse(0.U) + gotRetryAck := true.B + gotReissued := false.B + } + when((rxrsp.bits.chiOpcode.get === PCrdGrant) && !gotReissued) { + state.s_reissue.get := false.B + gotPCrdGrant := true.B + gotReissued := true.B + } + } + // when a P-Credit of this type is already in pCam -> reissue + when (io.pCamPri) { + state.s_reissue.get := false.B + gotPCrdGrant := true.B + gotReissued := true.B + } + + // replay + val replResp = io.replResp.bits + when (io.replResp.valid && replResp.retry) { + state.s_refill := false.B + state.s_retry := false.B + dirResult.way := replResp.way + when (retryTimes < backoffThreshold.U) { + retryTimes := retryTimes + 1.U + } + backoffTimer := 0.U + } + when (io.replResp.valid && !replResp.retry) { + state.w_replResp := true.B + + // update meta (no need to update hit/set/error/replacerInfo of dirResult) + dirResult.tag := replResp.tag + dirResult.way := replResp.way + dirResult.meta := replResp.meta + + // replacer choosing: + // 1. an invalid way, release no longer needed + // 2. the same way, just release as normal (only now we set s_release) + // 3. different way, we need to update meta and release that way + // if meta has client, rprobe client + when (replResp.meta.state =/= INVALID) { + // set release flags + state.s_release := false.B + state.w_releaseack := false.B + // rprobe clients if any + when (replResp.meta.clients.orR) { + state.s_rprobe := false.B + state.w_rprobeackfirst := false.B + state.w_rprobeacklast := false.B + } + } + } + + when (req_valid) { + timer := timer + 1.U + } + + val no_schedule = state.s_refill && state.s_probeack && state.s_release && + state.s_compack.getOrElse(true.B) && + state.s_cbwrdata.getOrElse(true.B) && + state.s_reissue.getOrElse(true.B) && + state.s_dct.getOrElse(true.B) + val no_wait = state.w_rprobeacklast && state.w_pprobeacklast && state.w_grantlast && state.w_releaseack && state.w_replResp + val will_free = no_schedule && no_wait + when (will_free && req_valid) { + req_valid := false.B + timer := 0.U + } + + // alias: should protect meta from being accessed or occupied + val releaseNotSent = !state.s_release + io.status.valid := req_valid + io.status.bits.channel := req.channel + io.status.bits.txChannel := req.txChannel // TODO + io.status.bits.set := req.set + io.status.bits.reqTag := req.tag + io.status.bits.metaTag := dirResult.tag + io.status.bits.needsRepl := releaseNotSent + // wait for resps, high as valid + io.status.bits.w_c_resp := !state.w_rprobeacklast || !state.w_pprobeacklast || !state.w_pprobeack + io.status.bits.w_d_resp := !state.w_grantlast || !state.w_grant || !state.w_releaseack + io.status.bits.will_free := will_free + io.status.bits.is_miss := !dirResult.hit + io.status.bits.is_prefetch := req_prefetch + io.status.bits.reqSource := req.reqSource + + io.msInfo.valid := req_valid + io.msInfo.bits.set := req.set + io.msInfo.bits.way :=
dirResult.way + io.msInfo.bits.reqTag := req.tag + io.msInfo.bits.aliasTask.foreach(_ := req.aliasTask.getOrElse(false.B)) + io.msInfo.bits.needRelease := !state.w_releaseack + // if releaseTask is already in mainpipe_s1/s2, while a refillTask in mainpipe_s3, the refill should also be blocked and retry + io.msInfo.bits.blockRefill := releaseNotSent || RegNext(releaseNotSent,false.B) || RegNext(RegNext(releaseNotSent,false.B),false.B) + io.msInfo.bits.dirHit := dirResult.hit + io.msInfo.bits.metaTag := dirResult.tag + io.msInfo.bits.metaState := meta.state + io.msInfo.bits.willFree := will_free + io.msInfo.bits.isAcqOrPrefetch := req_acquire || req_prefetch + io.msInfo.bits.isPrefetch := req_prefetch + io.msInfo.bits.param := req.param + io.msInfo.bits.mergeA := mergeA + io.msInfo.bits.w_grantfirst := state.w_grantfirst + io.msInfo.bits.s_refill := state.s_refill + io.msInfo.bits.w_releaseack := state.w_releaseack + io.msInfo.bits.w_replResp := state.w_replResp + io.msInfo.bits.w_rprobeacklast := state.w_rprobeacklast + io.msInfo.bits.replaceData := isT(meta.state) && meta.dirty || probeDirty + io.msInfo.bits.channel := req.channel + + assert(!(c_resp.valid && !io.status.bits.w_c_resp)) + assert(!(rxrsp.valid && !io.status.bits.w_d_resp)) + + /* ======== Handling Nested C ======== */ + // for A miss, only when replResp do we finally choose a way, allowing nested C + // for A-alias, always allowing nested C (state.w_replResp === true.B) + val nestedwb_match = req_valid && meta.state =/= INVALID && + dirResult.set === io.nestedwb.set && + dirResult.tag === io.nestedwb.tag && + state.w_replResp + val nestedwb_hit_match = req_valid && dirResult.hit && + dirResult.set === io.nestedwb.set && + dirResult.tag === io.nestedwb.tag + + when (nestedwb_match) { + when (io.nestedwb.c_set_dirty) { + meta.dirty := true.B + } + when (io.nestedwb.b_inv_dirty) { + meta.dirty := false.B + meta.state := INVALID + } + } + when (nestedwb_hit_match) { + when (io.nestedwb.b_toB.get) { + meta.state := Mux(meta.state >= BRANCH, BRANCH, INVALID) + meta.dirty := false.B + } + when (io.nestedwb.b_toN.get) { + meta.state := INVALID + dirResult.hit := false.B + meta.dirty := false.B + meta.clients := Fill(clientBits, false.B) + state.w_replResp := false.B + req.aliasTask.foreach(_ := false.B) + } + } + // let nested C write ReleaseData to the MSHRBuffer entry of this MSHR id + // This is the VALID signal for releaseBuf.io.w(2) + io.nestedwbData := nestedwb_match && io.nestedwb.c_set_dirty + + dontTouch(state) + + + // + // deadlock check + // + val validCnt = RegInit(0.U(64.W)) + when(io.alloc.valid) { + validCnt := 0.U + } + + when(req_valid) { + validCnt := validCnt + 1.U + } + + val mshrAddr = Cat(req.tag, req.set, 0.U(6.W)) // TODO: consider multibank + val VALID_CNT_MAX = 200000.U + assert(validCnt <= VALID_CNT_MAX, "validCnt full!, maybe there is a deadlock! 
addr => 0x%x req_opcode => %d channel => 0b%b", mshrAddr, req.opcode, req.channel) + + + /* ======== Performance counters ======== */ + // time stamp + // if (cacheParams.enablePerf) { + val acquire_ts = RegEnable(timer, false.B, io.tasks.txreq.fire) + val probe_ts = RegEnable(timer, false.B, io.tasks.source_b.fire) + val release_ts = RegEnable(timer, false.B, !mp_grant_valid && mp_release_valid && io.tasks.mainpipe.ready) + val acquire_period = IO(Output(UInt(64.W))) + val probe_period = IO(Output(UInt(64.W))) + val release_period = IO(Output(UInt(64.W))) + acquire_period := timer - acquire_ts + probe_period := timer - probe_ts + release_period := timer - release_ts + // } +} diff --git a/src/main/scala/coupledL2/tl2chi/MSHRCtl.scala b/src/main/scala/coupledL2/tl2chi/MSHRCtl.scala new file mode 100644 index 00000000..b0fb145a --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/MSHRCtl.scala @@ -0,0 +1,308 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import chisel3.util.random.LFSR +import utility._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import coupledL2.prefetch.PrefetchTrain +import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram, XSPerfMax} +import coupledL2._ +import tl2chi.{HasCHIMsgParameters} +import coupledL2.tl2chi.CHIOpcode.RSPOpcodes._ + +// PCrd info for MSHR Retry +class PCrdInfo(implicit p: Parameters) extends TL2CHIL2Bundle +{ + val valid = Bool() + val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W)) + val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W)) +} + +class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + /* interact with req arb */ + val fromReqArb = Input(new Bundle() { + val status_s1 = new PipeEntranceStatus() + }) + val toReqArb = Output(new BlockInfo()) + + /* interact with mainpipe */ + val fromMainPipe = new Bundle() { + val mshr_alloc_s3 = Flipped(ValidIO(new MSHRRequest())) + } + val toMainPipe = new Bundle() { + val mshr_alloc_ptr = Output(UInt(mshrBits.W)) + } + + /* to request arbiter */ + // val mshrFull = Output(Bool()) + val mshrTask = DecoupledIO(new TaskBundle()) + + /* status of s2 and s3 */ + val pipeStatusVec = Flipped(Vec(2, ValidIO(new PipeStatus))) + + /* send reqs */ + val toTXREQ = DecoupledIO(new CHIREQ()) + val toTXRSP = DecoupledIO(new CHIRSP()) // TODO: unify with main pipe, which should be TaskBundle + val toSourceB = DecoupledIO(new TLBundleB(edgeIn.bundle)) + + /* to block sourceB from sending same-addr probe until GrantAck received */ + val grantStatus = Input(Vec(grantBufInflightSize, new GrantStatus())) + + /* receive resps */ + val resps = Input(new Bundle() 
{ + val sinkC = new RespBundle() // ProbeAck from core + val rxrsp = new RespBundle() // release ack (CompDBIDResp) from CHI + val rxdat = new RespBundle() // CompData (for AcquireBlock) from CHI + }) + + val releaseBufWriteId = Output(UInt(mshrBits.W)) + + /* nested writeback */ + val nestedwb = Input(new NestedWriteback) + val nestedwbDataId = Output(ValidIO(UInt(mshrBits.W))) + + /* MSHR info to Sinks */ + val msInfo = Vec(mshrsAll, ValidIO(new MSHRInfo())) + val aMergeTask = Flipped(ValidIO(new AMergeTask)) + + /* replacer result read by refill */ + val replResp = Flipped(ValidIO(new ReplacerResult)) + + /* for TopDown Monitor */ + val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus))) + + /* pCrd info to Slice Top */ + val waitPCrdInfo = Output(Vec(mshrsAll, new PCrdInfo)) +}) + + /* MSHR allocation pointer generation -> to MainPipe */ + class MSHRSelector(implicit p: Parameters) extends L2Module { + val io = IO(new Bundle() { + val idle = Input(Vec(mshrsAll, Bool())) + val out = ValidIO(UInt(mshrsAll.W)) + }) + io.out.valid := ParallelOR(io.idle) + io.out.bits := ParallelPriorityMux(io.idle.zipWithIndex.map { + case (b, i) => (b, (1 << i).U) + }) + } + + val mshrs = Seq.fill(mshrsAll) { Module(new MSHR()) } + val mshrValids = VecInit(mshrs.map(m => m.io.status.valid)) + val pipeReqCount = PopCount(Cat(io.pipeStatusVec.map(_.valid))) // TODO: consider adding !mshrTask to optimize + val mshrCount = PopCount(Cat(mshrs.map(_.io.status.valid))) + val mshrFull = pipeReqCount + mshrCount >= mshrsAll.U + val a_mshrFull = pipeReqCount + mshrCount >= (mshrsAll-1).U // the last idle mshr should not be allocated for a channel A req + val mshrSelector = Module(new MSHRSelector()) + val selectedMSHROH = mshrSelector.io.out.bits + + mshrSelector.io.idle := mshrs.map(m => !m.io.status.valid) + io.toMainPipe.mshr_alloc_ptr := OHToUInt(selectedMSHROH) + + /* + on PCrdGrant, grant the credit to one entry that: + 1. has received RetryAck and has not been reissued + 2. matches both srcID and PCrdType + 3. wins the arbitration below when multiple entries match + */ + val isPCrdGrant = io.resps.rxrsp.valid && (io.resps.rxrsp.respInfo.chiOpcode.get === PCrdGrant) + val waitPCrdInfo = Wire(Vec(mshrsAll, new PCrdInfo)) +// val pArb = Module(new RRArbiter(UInt(), mshrsAll)) + + val matchPCrdGrant = VecInit(waitPCrdInfo.map(p => + isPCrdGrant && p.valid && + p.srcID.get === io.resps.rxrsp.respInfo.srcID.get && + p.pCrdType.get === io.resps.rxrsp.respInfo.pCrdType.get + )) + +/* pArb.io.in.zipWithIndex.foreach { + case (in, i) => + in.valid := matchPCrdGrant(i) + in.bits := 0.U + } + pArb.io.out.ready := true.B + val pCrdRR = VecInit(UIntToOH(pArb.io.chosen)) + val pCrdPri = VecInit((matchPCrdGrant.asUInt & pCrdRR.asUInt).asBools) +//val pCrdPri = VecInit(PriorityEncoderOH(matchPCrdGrant)) + val pCrdIsWait = OHToUInt(pCrdPri) + */ + + /* + Random arbiter when multiple entries match + */ + val lfsr = LFSR(16, true.B) + val idx = Random(16, lfsr) + val idxOH = VecInit(UIntToOH(idx)) + + val doubleReq = Fill(2, matchPCrdGrant.asUInt) + val doubleGnt = ~(doubleReq - idxOH.asUInt) & doubleReq + val gnt = doubleGnt(31,16) | doubleGnt(15,0) + val pCrdPri = VecInit(gnt.asBools) + val pCrdIsWait = OHToUInt(pCrdPri) + + /* when PCrdGrant comes before RetryAck, a 16-entry CAM is used to: + 1. save {srcID, PCrdType} + 2.
broadcast to each MSHR for searching when RetryAck arrives + */ +// val pCamValids = RegInit(VecInit(Seq.fill(mshrsAll){ false.B })) + val pCam = RegInit(VecInit(Seq.fill(mshrsAll)(0.U.asTypeOf(new PCrdInfo)))) + val pCamPri = Wire(UInt(5.W)) + val pCamValids = Cat(pCam.map(_.valid)) + val enqIdx = PriorityEncoder(~pCamValids.asUInt) + + when (isPCrdGrant && !pCrdIsWait.orR){ + pCam(enqIdx).valid := true.B + pCam(enqIdx).srcID.get := io.resps.rxrsp.respInfo.srcID.get + pCam(enqIdx).pCrdType.get := io.resps.rxrsp.respInfo.pCrdType.get + } + + pCamPri := 16.U // out of range of mshrsAll, meaning no match + + // match each waiting MSHR entry against the pCam + for (i <- 0 until mshrsAll) { // MSHR entry + when (waitPCrdInfo(i).valid) { + for (j <- 0 until mshrsAll) { // pCam entry + when (pCam(j).valid && + waitPCrdInfo(i).srcID.get === pCam(j).srcID.get && + waitPCrdInfo(i).pCrdType.get === pCam(j).pCrdType.get) { + pCam(j).valid := false.B + pCamPri := i.U + } + } + } + } + + /* SinkC (release) searches MSHR by PA */ + val resp_sinkC_match_vec = mshrs.map { mshr => + val status = mshr.io.status.bits + val tag = Mux(status.needsRepl, status.metaTag, status.reqTag) + mshr.io.status.valid && status.w_c_resp && io.resps.sinkC.set === status.set && io.resps.sinkC.tag === tag + } + + /* Port connection of MSHR entry */ + mshrs.zipWithIndex.foreach { + case (m, i) => + m.io.id := i.U + m.io.alloc.valid := selectedMSHROH(i) && io.fromMainPipe.mshr_alloc_s3.valid + m.io.alloc.bits := io.fromMainPipe.mshr_alloc_s3.bits + m.io.alloc.bits.task.isKeyword.foreach(_ := io.fromMainPipe.mshr_alloc_s3.bits.task.isKeyword.getOrElse(false.B)) + + m.io.resps.sinkC.valid := io.resps.sinkC.valid && resp_sinkC_match_vec(i) + m.io.resps.sinkC.bits := io.resps.sinkC.respInfo + + m.io.resps.rxdat.valid := m.io.status.valid && io.resps.rxdat.valid && io.resps.rxdat.mshrId === i.U + m.io.resps.rxdat.bits := io.resps.rxdat.respInfo + + m.io.resps.rxrsp.valid := (m.io.status.valid && io.resps.rxrsp.valid && !isPCrdGrant && io.resps.rxrsp.mshrId === i.U) || (isPCrdGrant && pCrdPri(i)) + m.io.resps.rxrsp.bits := io.resps.rxrsp.respInfo + + m.io.replResp.valid := io.replResp.valid && io.replResp.bits.mshrId === i.U + m.io.replResp.bits := io.replResp.bits + + io.msInfo(i) := m.io.msInfo + m.io.nestedwb := io.nestedwb + m.io.aMergeTask.valid := io.aMergeTask.valid && io.aMergeTask.bits.id === i.U + m.io.aMergeTask.bits := io.aMergeTask.bits.task + + waitPCrdInfo(i) := m.io.waitPCrdInfo + m.io.pCamPri := (pCamPri === i.U) && waitPCrdInfo(i).valid + } + /* pCrd wait info to Slice Top */ + io.waitPCrdInfo <> waitPCrdInfo + + /* Reserve 1 entry for SinkB */ + io.toReqArb.blockC_s1 := false.B + io.toReqArb.blockB_s1 := mshrFull // conflict logic in SinkB + io.toReqArb.blockA_s1 := a_mshrFull // conflict logic in ReqBuf + io.toReqArb.blockG_s1 := false.B + + /* Acquire downwards via TXREQ */ + fastArb(mshrs.map(_.io.tasks.txreq), io.toTXREQ, Some("txreq")) + + /* Responses downwards via TXRSP */ + fastArb(mshrs.map(_.io.tasks.txrsp), io.toTXRSP, Some("txrsp")) + + /* Probe upwards */ + val sourceB = Module(new SourceB()) + fastArb(mshrs.map(_.io.tasks.source_b), sourceB.io.task, Some("source_b")) + sourceB.io.grantStatus := io.grantStatus + io.toSourceB <> sourceB.io.sourceB + + /* Arbitrate MSHR task to RequestArbiter */ + fastArb(mshrs.map(_.io.tasks.mainpipe), io.mshrTask, Some("mshr_task")) + + /* releaseBuf link to MSHR id */ + io.releaseBufWriteId := ParallelPriorityMux(resp_sinkC_match_vec, (0 until mshrsAll).map(i => i.U)) + + /* Nest writeback check */ + io.nestedwbDataId.valid :=
Cat(mshrs.map(_.io.nestedwbData)).orR + io.nestedwbDataId.bits := ParallelPriorityMux(mshrs.zipWithIndex.map { + case (mshr, i) => (mshr.io.nestedwbData, i.U) + }) + assert(RegNext(PopCount(mshrs.map(_.io.nestedwbData)) <= 1.U), "should only be one nestedwbData") + + + /* Status for topDown monitor */ + topDownOpt.foreach (_ => + io.msStatus.get.zip(mshrs).foreach { + case (in, s) => in := s.io.status + } + ) + /* Performance counters */ +/* XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkA", a_mshrFull) + XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkB", mshrFull) + XSPerfHistogram(cacheParams, "mshr_alloc", io.toMainPipe.mshr_alloc_ptr, + enable = io.fromMainPipe.mshr_alloc_s3.valid, + start = 0, stop = mshrsAll, step = 1) + if (cacheParams.enablePerf) { + val start = 0 + val stop = 100 + val step = 5 + val acquire_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_d.valid -> m.acquire_period }) + val release_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_d.valid -> m.release_period }) + val probe_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_c.valid -> m.probe_period }) + val acquire_period_en = io.resps.rxdat.valid && + (io.resps.rxdat.respInfo.opcode === Grant || io.resps.rxdat.respInfo.opcode === GrantData) + val release_period_en = io.resps.rxdat.valid && io.resps.rxdat.respInfo.opcode === ReleaseAck + val probe_period_en = io.resps.sinkC.valid && + (io.resps.sinkC.respInfo.opcode === ProbeAck || io.resps.sinkC.respInfo.opcode === ProbeAckData) + XSPerfHistogram(cacheParams, "acquire_period", acquire_period, acquire_period_en, start, stop, step) + XSPerfHistogram(cacheParams, "release_period", release_period, release_period_en, start, stop, step) + XSPerfHistogram(cacheParams, "probe_period", probe_period, probe_period_en, start, stop, step) + + val timers = RegInit(VecInit(Seq.fill(mshrsAll)(0.U(64.W)))) + for (((timer, m), i) <- timers.zip(mshrs).zipWithIndex) { + when (m.io.alloc.valid) { + timer := 1.U + }.otherwise { + timer := timer + 1.U + } + val enable = m.io.status.valid && m.io.status.bits.will_free + XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), + timer, enable, 0, 300, 10) + XSPerfMax(cacheParams, "mshr_latency", timer, enable) + } + }*/ +} + diff --git a/src/main/scala/coupledL2/tl2chi/MainPipe.scala b/src/main/scala/coupledL2/tl2chi/MainPipe.scala new file mode 100644 index 00000000..020e0b76 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/MainPipe.scala @@ -0,0 +1,895 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import freechips.rocketchip.tilelink.TLPermissions._ +import org.chipsalliance.cde.config.Parameters +import coupledL2._ +import coupledL2.prefetch.{PrefetchTrain, PfSource} +import coupledL2.tl2chi.CHIOpcode._ +import coupledL2.tl2chi.CHICohStates._ +import coupledL2.MetaData._ +import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram} + +class MainPipe(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + /* receive task from arbiter at stage 2 */ + val taskFromArb_s2 = Flipped(ValidIO(new TaskBundle())) + /* status from arbiter at stage1 */ + val taskInfo_s1 = Flipped(ValidIO(new TaskBundle())) + + /* handle set conflict in req arb */ + val fromReqArb = Input(new Bundle() { + val status_s1 = new PipeEntranceStatus + }) + /* block B and C at Entrance */ + val toReqArb = Output(new BlockInfo()) + + /* block A at Entrance */ + val toReqBuf = Output(Vec(2, Bool())) + + /* handle capacity conflict of GrantBuffer */ + val status_vec_toD = Vec(3, ValidIO(new PipeStatus)) + /* handle capacity conflict of TX channels */ + val status_vec_toTX = Vec(3, ValidIO(new PipeStatusWithCHI)) + + /* get dir result at stage 3 */ + val dirResp_s3 = Input(new DirResult()) + val replResp = Flipped(ValidIO(new ReplacerResult())) + + /* send task to MSHRCtl at stage 3 */ + val toMSHRCtl = new Bundle() { + val mshr_alloc_s3 = ValidIO(new MSHRRequest()) + } + + val fromMSHRCtl = new Bundle() { + val mshr_alloc_ptr = Input(UInt(mshrBits.W)) + } + + /* read C-channel Release Data and write into DS */ + val bufResp = Input(new PipeBufferResp) + + /* get ReleaseBuffer and RefillBuffer read result */ + val refillBufResp_s3 = Flipped(ValidIO(new DSBlock)) + val releaseBufResp_s3 = Flipped(ValidIO(new DSBlock)) + + /* read or write data storage */ + val toDS = new Bundle() { + val req_s3 = ValidIO(new DSRequest) + val rdata_s5 = Input(new DSBlock) + val wdata_s3 = Output(new DSBlock) + } + + /* send Grant via SourceD channel */ + val toSourceD = DecoupledIO(new TaskWithData()) + + /* send req/Comp/CompData via TXREQ/TXRSP/TXDAT channel */ + val toTXREQ = DecoupledIO(new CHIREQ()) + val toTXRSP = DecoupledIO(new TaskBundle()) + val toTXDAT = DecoupledIO(new TaskWithData()) + + /* write dir, including reset dir */ + val metaWReq = ValidIO(new MetaWrite) + val tagWReq = ValidIO(new TagWrite) + + /* read DS and write data into ReleaseBuf when the task needs to replace */ + val releaseBufWrite = ValidIO(new MSHRBufWrite()) + + /* nested writeback */ + val nestedwb = Output(new NestedWriteback()) + val nestedwbData = Output(new DSBlock()) + + /* l2 refill hint */ + val l1Hint = DecoupledIO(new L2ToL1Hint()) + // val grantBufferHint = Flipped(ValidIO(new L2ToL1Hint())) + // val globalCounter = Input(UInt((log2Ceil(mshrsAll) + 1).W)) + + /* send prefetchTrain to Prefetch to trigger a prefetch req */ + val prefetchTrain = prefetchOpt.map(_ => DecoupledIO(new PrefetchTrain)) + + /* top-down monitor */ + // TODO + }) + + require(chiOpt.isDefined) + + val resetFinish = RegInit(false.B) + val resetIdx = RegInit((cacheParams.sets - 1).U) + /* block reqs when reset */ + when (!resetFinish) { + resetIdx := resetIdx - 1.U + } + when (resetIdx === 0.U) { + resetFinish := true.B + } + + val txreq_s3, txreq_s4, txreq_s5 = 
WireInit(0.U.asTypeOf(io.toTXREQ.cloneType)) + val txrsp_s3, txrsp_s4, txrsp_s5 = Wire(io.toTXRSP.cloneType) + val txdat_s3, txdat_s4, txdat_s5 = Wire(io.toTXDAT.cloneType) + val d_s3, d_s4, d_s5 = Wire(io.toSourceD.cloneType) + + /* ======== Stage 2 ======== */ + val task_s2 = io.taskFromArb_s2 + + /* ======== Stage 3 ======== */ + val task_s3 = RegInit(0.U.asTypeOf(Valid(new TaskBundle))) + task_s3.valid := task_s2.valid + when (task_s2.valid) { + task_s3.bits := task_s2.bits + } + + /* ======== Enchantment ======== */ + val dirResult_s3 = io.dirResp_s3 + val meta_s3 = dirResult_s3.meta + val req_s3 = task_s3.bits + + val mshr_req_s3 = req_s3.mshrTask + val sink_req_s3 = !mshr_req_s3 + val sinkA_req_s3 = !mshr_req_s3 && req_s3.fromA + val sinkB_req_s3 = !mshr_req_s3 && req_s3.fromB + val sinkC_req_s3 = !mshr_req_s3 && req_s3.fromC + + val req_acquire_s3 = sinkA_req_s3 && (req_s3.opcode === AcquireBlock || req_s3.opcode === AcquirePerm) + val req_acquireBlock_s3 = sinkA_req_s3 && req_s3.opcode === AcquireBlock + val req_prefetch_s3 = sinkA_req_s3 && req_s3.opcode === Hint + val req_get_s3 = sinkA_req_s3 && req_s3.opcode === Get + + val mshr_grant_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode(2, 1) === Grant(2, 1) // Grant or GrantData from mshr + val mshr_grantdata_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === GrantData + val mshr_accessackdata_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === AccessAckData + val mshr_hintack_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === HintAck + + val mshr_snpResp_s3 = mshr_req_s3 && req_s3.toTXRSP && req_s3.chiOpcode.get === RSPOpcodes.SnpResp + val mshr_snpRespFwded_s3 = mshr_req_s3 && req_s3.toTXRSP && req_s3.chiOpcode.get === RSPOpcodes.SnpRespFwded + val mshr_snpRespData_s3 = mshr_req_s3 && req_s3.toTXDAT && req_s3.chiOpcode.get === DATOpcodes.SnpRespData + val mshr_snpRespDataPtl_s3 = mshr_req_s3 && req_s3.toTXDAT && req_s3.chiOpcode.get === DATOpcodes.SnpRespDataPtl + val mshr_snpRespDataFwded_s3 = mshr_req_s3 && req_s3.toTXDAT && req_s3.chiOpcode.get === DATOpcodes.SnpRespDataFwded + val mshr_snpRespX_s3 = mshr_snpResp_s3 || mshr_snpRespFwded_s3 + val mshr_snpRespDataX_s3 = mshr_snpRespData_s3 || mshr_snpRespDataPtl_s3 || mshr_snpRespDataFwded_s3 + + val mshr_dct_s3 = mshr_req_s3 && req_s3.toTXDAT && req_s3.chiOpcode.get === DATOpcodes.CompData + + val mshr_writeBackFull_s3 = mshr_req_s3 && req_s3.toTXREQ && req_s3.chiOpcode.get === REQOpcodes.WriteBackFull + val mshr_evict_s3 = mshr_req_s3 && req_s3.toTXREQ && req_s3.chiOpcode.get === REQOpcodes.Evict + + val mshr_cbWrData_s3 = mshr_req_s3 && req_s3.toTXDAT && req_s3.chiOpcode.get === DATOpcodes.CopyBackWrData + + val meta_has_clients_s3 = meta_s3.clients.orR + val req_needT_s3 = needT(req_s3.opcode, req_s3.param) + + val cache_alias = req_acquire_s3 && dirResult_s3.hit && meta_s3.clients(0) && + meta_s3.alias.getOrElse(0.U) =/= req_s3.alias.getOrElse(0.U) + + val mshr_refill_s3 = mshr_accessackdata_s3 || mshr_hintack_s3 || mshr_grant_s3 // needs refill to L2 DS + val retry = io.replResp.valid && io.replResp.bits.retry + val need_repl = io.replResp.valid && io.replResp.bits.meta.state =/= INVALID && req_s3.replTask + + /* ======== Interact with MSHR ======== */ + val acquire_on_miss_s3 = req_acquire_s3 || req_prefetch_s3 || req_get_s3 + val acquire_on_hit_s3 = meta_s3.state === BRANCH && req_needT_s3 && !req_prefetch_s3 + val need_acquire_s3_a = req_s3.fromA && Mux( + dirResult_s3.hit, + acquire_on_hit_s3, + acquire_on_miss_s3 + ) + val need_probe_s3_a = req_get_s3 && 
dirResult_s3.hit && meta_s3.state === TRUNK + + val need_mshr_s3_a = need_acquire_s3_a || need_probe_s3_a || cache_alias + + /** + * 1. For SnpOnce/SnpOnceFwd, only the latest copy of the cacheline is needed without changing the state of the + * cacheline at the snoopee. Therefore L2 should only send pProbe toT (to get the latest copy) when the state + * in L2 is TRUNK + * 2. For SnpClean/SnpCleanFwd, SnpShared/SnpSharedFwd, SnpNotSharedDirty/SnpNotSharedDirtyFwd, and SnpCleanShared, + * the snooped cacheline should be degraded into BRANCH state because there is no SharedDirty state or Owner + * state (of MOESI) in CoupledL2. Therefore L2 should only send pProbe toB to degrade upper clients when the + * state in L2 is TRUNK + * 3. For SnpUnique/SnpUniqueFwd/SnpUniqueStash, SnpCleanInvalid, SnpMakeInvalid/SnpMakeInvalidStash, the snooped + * cacheline should be degraded into INVALID state. Therefore L2 should only send pProbe toN to degrade upper + * clients when the state in L2 is TRUNK or BRANCH with clients.orR = 1 + * + */ + // whether L2 should do forwarding or not + val expectFwd = SNPOpcodes.isSnpXFwd(req_s3.chiOpcode.get) + val canFwd = dirResult_s3.hit + val doFwd = expectFwd && canFwd + val doFwdHitRelease = expectFwd && req_s3.snpHitRelease && req_s3.snpHitReleaseWithData + val need_pprobe_s3_b_snpOnceX = req_s3.fromB && SNPOpcodes.isSnpOnceX(req_s3.chiOpcode.get) && + dirResult_s3.hit && meta_s3.state === TRUNK && meta_has_clients_s3 + val need_pprobe_s3_b_snpToB = req_s3.fromB && ( + SNPOpcodes.isSnpToB(req_s3.chiOpcode.get) || + req_s3.chiOpcode.get === SNPOpcodes.SnpCleanShared + ) && dirResult_s3.hit && meta_s3.state === TRUNK && meta_has_clients_s3 + val need_pprobe_s3_b_snpToN = req_s3.fromB && ( + SNPOpcodes.isSnpUniqueX(req_s3.chiOpcode.get) || + req_s3.chiOpcode.get === SNPOpcodes.SnpCleanInvalid || + SNPOpcodes.isSnpMakeInvalidX(req_s3.chiOpcode.get) + ) && dirResult_s3.hit && meta_s3.state =/= TIP && meta_has_clients_s3 + val need_pprobe_s3_b = need_pprobe_s3_b_snpOnceX || need_pprobe_s3_b_snpToB || need_pprobe_s3_b_snpToN + val need_dct_s3_b = doFwd || doFwdHitRelease // DCT + val need_mshr_s3_b = need_pprobe_s3_b || need_dct_s3_b + + val need_mshr_s3 = need_mshr_s3_a || need_mshr_s3_b + + /* Signals to MSHR Ctl */ + val alloc_state = WireInit(0.U.asTypeOf(new FSMState())) + alloc_state.elements.foreach(_._2 := true.B) + io.toMSHRCtl.mshr_alloc_s3.valid := task_s3.valid && !mshr_req_s3 && need_mshr_s3 + io.toMSHRCtl.mshr_alloc_s3.bits.dirResult := dirResult_s3 + io.toMSHRCtl.mshr_alloc_s3.bits.state := alloc_state + io.toMSHRCtl.mshr_alloc_s3.bits.task match { case task => + task := req_s3 + task.bufIdx := 0.U(bufIdxBits.W) + task.mshrTask := false.B + task.aliasTask.foreach(_ := cache_alias) + task.wayMask := 0.U(cacheParams.ways.W) + // TODO + } + + /* ======== Resps to SinkA/B/C Reqs ======== */ + val sink_resp_s3 = WireInit(0.U.asTypeOf(Valid(new TaskBundle))) + val sink_resp_s3_a_promoteT = dirResult_s3.hit && isT(meta_s3.state) + + // whether L2 should respond data to HN or not + val retToSrc = req_s3.retToSrc.getOrElse(false.B) + val neverRespData = SNPOpcodes.isSnpMakeInvalidX(req_s3.chiOpcode.get) || + SNPOpcodes.isSnpStashX(req_s3.chiOpcode.get) || + req_s3.chiOpcode.get === SNPOpcodes.SnpOnceFwd || + req_s3.chiOpcode.get === SNPOpcodes.SnpUniqueFwd + val shouldRespData_dirty = dirResult_s3.hit && (meta_s3.state === TIP || meta_s3.state === TRUNK) && meta_s3.dirty + // For forwarding snoops, if the RetToSrc value is 1, the snoopee must return a copy if the cache line
is Dirty or Clean. + val shouldRespData_retToSrc_fwd = dirResult_s3.hit && retToSrc && SNPOpcodes.isSnpXFwd(req_s3.chiOpcode.get) + // For non-forwarding snoops, if the RetToSrc value is 1, the snoopee must return a copy if the cache line is Shared Clean and + // it retains a copy of the cache line. + val shouldRespData_retToSrc_nonFwd = dirResult_s3.hit && retToSrc && meta_s3.state === BRANCH && ( + req_s3.chiOpcode.get === SNPOpcodes.SnpOnce || + req_s3.chiOpcode.get === SNPOpcodes.SnpUnique || + SNPOpcodes.isSnpToBNonFwd(req_s3.chiOpcode.get) + ) + val shouldRespData = shouldRespData_dirty || shouldRespData_retToSrc_fwd || shouldRespData_retToSrc_nonFwd + val doRespData = shouldRespData && !neverRespData + val doRespDataHitRelease = req_s3.snpHitRelease && req_s3.snpHitReleaseWithData && !neverRespData + dontTouch(doRespData) + dontTouch(shouldRespData) + dontTouch(neverRespData) + + // Resp[2: 0] = {PassDirty, CacheState[1: 0]} + val respCacheState = WireInit(I) + val respPassDirty = dirResult_s3.hit && meta_s3.state === TIP && meta_s3.dirty && + !(neverRespData || req_s3.chiOpcode.get === SNPOpcodes.SnpOnce) + when (dirResult_s3.hit) { + when (SNPOpcodes.isSnpToB(req_s3.chiOpcode.get)) { + respCacheState := SC + } + when (SNPOpcodes.isSnpOnceX(req_s3.chiOpcode.get) || SNPOpcodes.isSnpStashX(req_s3.chiOpcode.get)) { + respCacheState := Mux( + meta_s3.state === BRANCH, + SC, + Mux(meta_s3.dirty, UD, UC) + ) + } + when (req_s3.chiOpcode.get === SNPOpcodes.SnpCleanShared) { + respCacheState := Mux(meta_s3.state === BRANCH, SC, UC) + } + } + + // FwdState[2: 0] = {PassDirty, CacheState[1: 0]} + val fwdCacheState = WireInit(I) + val fwdPassDirty = WireInit(false.B) + when (dirResult_s3.hit) { + when (SNPOpcodes.isSnpToBFwd(req_s3.chiOpcode.get)) { + fwdCacheState := SC + } + when (req_s3.chiOpcode.get === SNPOpcodes.SnpUniqueFwd) { + when (meta_s3.state === TIP && meta_s3.dirty) { + fwdCacheState := UD + fwdPassDirty := true.B + }.otherwise { + fwdCacheState := UC + } + } + } + + sink_resp_s3.valid := task_s3.valid && !mshr_req_s3 && !need_mshr_s3 + sink_resp_s3.bits := task_s3.bits + sink_resp_s3.bits.mshrId := (1 << (mshrBits-1)).U + sink_resp_s3.bits.sourceId + when (req_s3.fromA) { + sink_resp_s3.bits.opcode := odOpGen(req_s3.opcode) + sink_resp_s3.bits.param := Mux ( + req_acquire_s3, + Mux(req_s3.param === NtoB && !sink_resp_s3_a_promoteT, toB, toT), + 0.U // reserved + ) + }.elsewhen (req_s3.fromB) { + + sink_resp_s3.bits.opcode := 0.U + sink_resp_s3.bits.param := 0.U + + sink_resp_s3.bits.tgtID.foreach(_ := task_s3.bits.srcID.get) + sink_resp_s3.bits.srcID.foreach(_ := task_s3.bits.tgtID.get) // TODO: srcID should be fixed. FIX THIS!!!
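As an illustration (editor's sketch, not part of the patch): the Resp/FwdState fields built here pack {PassDirty, CacheState[1:0]} into three bits, with setPD ORing the PassDirty bit onto a 2-bit cache-state encoding. A minimal plain-Scala model of that encoding, where the concrete state values are assumptions mirroring CHICohStates rather than values taken from the patch:

object RespEncodingSketch extends App {
  // assumed 2-bit cache-state encodings, for illustration only
  val I  = 0x0 // Invalid
  val SC = 0x1 // Shared Clean
  val UC = 0x2 // Unique Clean
  // setPD: OR the PassDirty bit (bit 2) onto the cache state
  def setPD(state: Int, passDirty: Boolean): Int =
    state | (if (passDirty) 0x4 else 0x0)
  // a dirty TIP line invalidated by SnpUnique responds I with PassDirty set
  assert(setPD(I, passDirty = true) == 0x4)
  // a line degraded to BRANCH by a SnpToB responds SC without PassDirty
  assert(setPD(SC, passDirty = false) == 0x1)
}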
+ sink_resp_s3.bits.txnID.foreach(_ := task_s3.bits.txnID.get) + sink_resp_s3.bits.dbID.foreach(_ := 0.U) + sink_resp_s3.bits.pCrdType.foreach(_ := 0.U) // TODO + sink_resp_s3.bits.chiOpcode.foreach(_ := MuxLookup( + Cat(doFwd || doFwdHitRelease, doRespData || doRespDataHitRelease), + RSPOpcodes.SnpResp + )(Seq( + Cat(false.B, false.B) -> RSPOpcodes.SnpResp, + Cat(true.B, false.B) -> RSPOpcodes.SnpRespFwded, + Cat(false.B, true.B) -> DATOpcodes.SnpRespData, // ignore SnpRespDataPtl for now + Cat(true.B, true.B) -> DATOpcodes.SnpRespDataFwded + ))) + sink_resp_s3.bits.resp.foreach(_ := Mux( + req_s3.snpHitRelease && !SNPOpcodes.isSnpStashX(req_s3.chiOpcode.get), + setPD(I, req_s3.snpHitReleaseWithData && !SNPOpcodes.isSnpMakeInvalidX(req_s3.chiOpcode.get)), + setPD(respCacheState, respPassDirty && (doRespData || doRespDataHitRelease)) + )) + sink_resp_s3.bits.fwdState.foreach(_ := setPD(fwdCacheState, fwdPassDirty)) + sink_resp_s3.bits.txChannel := Cat( + doRespData || doRespDataHitRelease, + !(doRespData || doRespDataHitRelease), + false.B + ) // TODO: parameterize this + sink_resp_s3.bits.size := log2Ceil(blockBytes).U + + }.otherwise { // req_s3.fromC + sink_resp_s3.bits.opcode := ReleaseAck + sink_resp_s3.bits.param := 0.U // param of ReleaseAck must be 0 + } + + val source_req_s3 = Wire(new TaskBundle) + source_req_s3 := Mux(sink_resp_s3.valid, sink_resp_s3.bits, req_s3) + source_req_s3.isKeyword.foreach(_ := req_s3.isKeyword.getOrElse(false.B)) + + /* ======== Interact with DS ======== */ + val data_s3 = Mux(io.releaseBufResp_s3.valid, io.releaseBufResp_s3.bits.data, io.refillBufResp_s3.bits.data) + val c_releaseData_s3 = io.bufResp.data.asUInt + val hasData_s3_tl = source_req_s3.opcode(0) // whether to respond data to TileLink-side + val hasData_s3_chi = source_req_s3.toTXDAT // whether to respond data to CHI-side + val hasData_s3 = hasData_s3_tl || hasData_s3_chi + + val need_data_a = dirResult_s3.hit && (req_get_s3 || req_acquireBlock_s3) + val need_data_b = sinkB_req_s3 && (doRespData || doFwd || dirResult_s3.hit && meta_s3.state === TRUNK) + val need_data_mshr_repl = mshr_refill_s3 && need_repl && !retry + val ren = need_data_a || need_data_b || need_data_mshr_repl + + val wen_c = sinkC_req_s3 && isParamFromT(req_s3.param) && req_s3.opcode(0) && dirResult_s3.hit + val wen_mshr = req_s3.dsWen && ( + mshr_snpRespX_s3 || mshr_snpRespDataX_s3 || mshr_writeBackFull_s3 || mshr_evict_s3 || + mshr_refill_s3 && !need_repl && !retry + ) + val wen = wen_c || wen_mshr + + io.toDS.req_s3.valid := task_s3.valid && (ren || wen) + io.toDS.req_s3.bits.way := Mux( + mshr_refill_s3 && req_s3.replTask, + io.replResp.bits.way, + Mux(mshr_req_s3, req_s3.way, dirResult_s3.way) + ) + io.toDS.req_s3.bits.set := Mux(mshr_req_s3, req_s3.set, dirResult_s3.set) + io.toDS.req_s3.bits.wen := wen + io.toDS.wdata_s3.data := Mux( + !mshr_req_s3, + c_releaseData_s3, + Mux( + req_s3.useProbeData, + io.releaseBufResp_s3.bits.data, + io.refillBufResp_s3.bits.data + ) + ) + + /* ======== Read DS and store data in Buffer ======== */ + // A: need_write_releaseBuf indicates that DS should be read and the data will be written into ReleaseBuffer + // need_write_releaseBuf is assigned true when: + // inner clients' data is needed, but whether the client will ack data is uncertain, so DS data is also needed + val need_write_releaseBuf = need_probe_s3_a || + cache_alias || + need_data_b && need_mshr_s3_b || + need_data_mshr_repl + // B: need_write_refillBuf indicates that DS should be read and the data will be written into 
RefillBuffer + // when L1 AcquireBlock but L2 AcquirePerm to L3, we need to prepare data for L1 + // but this will no longer happen, because we always AcquireBlock for L1 AcquireBlock + val need_write_refillBuf = false.B + + /* ======== Write Directory ======== */ + val metaW_valid_s3_a = sinkA_req_s3 && !need_mshr_s3_a && !req_get_s3 && !req_prefetch_s3 // get & prefetch that hit will not write meta + val metaW_valid_s3_b = sinkB_req_s3 && !need_mshr_s3_b && dirResult_s3.hit && + !SNPOpcodes.isSnpOnceX(req_s3.chiOpcode.get) && !SNPOpcodes.isSnpStashX(req_s3.chiOpcode.get) && ( + meta_s3.state === TIP || meta_s3.state === BRANCH && SNPOpcodes.isSnpToN(req_s3.chiOpcode.get) + ) + val metaW_valid_s3_c = sinkC_req_s3 && dirResult_s3.hit + val metaW_valid_s3_mshr = mshr_req_s3 && req_s3.metaWen && !(mshr_refill_s3 && retry) + require(clientBits == 1) + + val metaW_s3_a_alias = Mux( + req_get_s3 || req_prefetch_s3, + meta_s3.alias.getOrElse(0.U), + req_s3.alias.getOrElse(0.U) + ) + val metaW_s3_a = MetaEntry( + dirty = meta_s3.dirty, + state = Mux(req_needT_s3 || sink_resp_s3_a_promoteT, TRUNK, meta_s3.state), + clients = Fill(clientBits, true.B), + alias = Some(metaW_s3_a_alias), + accessed = true.B + ) + val metaW_s3_b = Mux(SNPOpcodes.isSnpToN(req_s3.chiOpcode.get), MetaEntry(), + MetaEntry( + dirty = false.B, + state = Mux(req_s3.chiOpcode.get === SNPOpcodes.SnpCleanShared, meta_s3.state, BRANCH), + clients = meta_s3.clients, + alias = meta_s3.alias, + accessed = meta_s3.accessed + ) + ) + val metaW_s3_c = MetaEntry( + dirty = meta_s3.dirty || wen_c, + state = Mux(isParamFromT(req_s3.param), TIP, meta_s3.state), + clients = Fill(clientBits, !isToN(req_s3.param)), + alias = meta_s3.alias, + accessed = meta_s3.accessed + ) + // use merge_meta if mergeA + val metaW_s3_mshr = Mux(req_s3.mergeA, req_s3.aMergeTask.meta, req_s3.meta) + + val metaW_way = Mux( + mshr_refill_s3 && req_s3.replTask, + io.replResp.bits.way, // grant always uses the replResp way + Mux(mshr_req_s3, req_s3.way, dirResult_s3.way) + ) + + io.metaWReq.valid := !resetFinish || task_s3.valid && ( + metaW_valid_s3_a || metaW_valid_s3_b || metaW_valid_s3_c || metaW_valid_s3_mshr + ) + io.metaWReq.bits.set := Mux(resetFinish, req_s3.set, resetIdx) + io.metaWReq.bits.wayOH := Mux(resetFinish, UIntToOH(metaW_way), Fill(cacheParams.ways, true.B)) + io.metaWReq.bits.wmeta := Mux( + resetFinish, + ParallelPriorityMux( + Seq(metaW_valid_s3_a, metaW_valid_s3_b, metaW_valid_s3_c, metaW_valid_s3_mshr), + Seq(metaW_s3_a, metaW_s3_b, metaW_s3_c, metaW_s3_mshr) + ), + MetaEntry() + ) + + io.tagWReq.valid := task_s3.valid && req_s3.tagWen && mshr_refill_s3 && !retry + io.tagWReq.bits.set := req_s3.set + io.tagWReq.bits.way := Mux(mshr_refill_s3 && req_s3.replTask, io.replResp.bits.way, req_s3.way) + io.tagWReq.bits.wtag := req_s3.tag + + /* ======== Interact with Channels (SourceD/TXREQ/TXRSP/TXDAT) ======== */ + val chnl_fire_s3 = d_s3.fire || txreq_s3.fire || txrsp_s3.fire || txdat_s3.fire + val req_drop_s3 = !need_write_releaseBuf && ( + !mshr_req_s3 && need_mshr_s3 || chnl_fire_s3 + ) || mshr_refill_s3 && retry + + val data_unready_s3 = hasData_s3 && !mshr_req_s3 + val data_unready_s3_tl = hasData_s3_tl && !mshr_req_s3 + /** + * The combinational logic path of + * Directory metaAll + * -> Directory response + * -> MainPipe judging whether to respond data + * is too long. Therefore the sinkB response may be latched to s4 for better timing.
+ */ + val d_s3_latch = true + val txdat_s3_latch = true + val isD_s3 = Mux( + mshr_req_s3, + mshr_refill_s3 && !retry, + req_s3.fromC || req_s3.fromA && !need_mshr_s3_a && !data_unready_s3_tl && req_s3.opcode =/= Hint + ) + val isD_s3_ready = Mux( + mshr_req_s3, + mshr_refill_s3 && !retry, + req_s3.fromC || req_s3.fromA && !need_mshr_s3_a && !data_unready_s3_tl && req_s3.opcode =/= Hint && !d_s3_latch.B + ) + val isTXRSP_s3 = Mux( + mshr_req_s3, + mshr_snpRespX_s3, + req_s3.fromB && !need_mshr_s3 && !hasData_s3 + ) + val isTXDAT_s3 = Mux( + mshr_req_s3, + mshr_snpRespDataX_s3 || mshr_cbWrData_s3 || mshr_dct_s3, + req_s3.fromB && !need_mshr_s3 && (doRespDataHitRelease || doRespData && !data_unready_s3) + ) + val isTXDAT_s3_ready = Mux( + mshr_req_s3, + mshr_snpRespDataX_s3 || mshr_cbWrData_s3 || mshr_dct_s3, + req_s3.fromB && !need_mshr_s3 && (doRespDataHitRelease || doRespData && !data_unready_s3) && !txdat_s3_latch.B + ) + val isTXREQ_s3 = mshr_req_s3 && (mshr_writeBackFull_s3 || mshr_evict_s3) + + txreq_s3.valid := task_s3.valid && isTXREQ_s3 + txrsp_s3.valid := task_s3.valid && isTXRSP_s3 + txdat_s3.valid := task_s3.valid && isTXDAT_s3_ready + d_s3.valid := task_s3.valid && isD_s3_ready + txreq_s3.bits := source_req_s3.toCHIREQBundle() + txrsp_s3.bits := source_req_s3 + txdat_s3.bits.task := source_req_s3 + txdat_s3.bits.data.data := data_s3 + d_s3.bits.task := source_req_s3 + d_s3.bits.data.data := data_s3 + + when (task_s3.valid) { + OneHot.checkOneHot(Seq(isTXREQ_s3, isTXRSP_s3, isTXDAT_s3, isD_s3)) + } + + /* ======== nested writeback ======== */ + io.nestedwb.set := req_s3.set + io.nestedwb.tag := req_s3.tag + // This serves as VALID signal + // c_set_dirty is true iff Release has Data + io.nestedwb.c_set_dirty := task_s3.valid && task_s3.bits.fromC && task_s3.bits.opcode === ReleaseData && task_s3.bits.param === TtoN + /** + * Snoop nesting happens when: + * 1. snoop nests a copy-back request + * 2. 
snoop nests a Read/MakeUnique request + */ + io.nestedwb.b_inv_dirty := task_s3.valid && task_s3.bits.fromB && source_req_s3.snpHitRelease + io.nestedwb.b_toB.foreach(_ := + task_s3.valid && task_s3.bits.fromB && source_req_s3.metaWen && source_req_s3.meta.state === BRANCH + ) + io.nestedwb.b_toN.foreach(_ := + task_s3.valid && task_s3.bits.fromB && source_req_s3.metaWen && source_req_s3.meta.state === INVALID + ) + + io.nestedwbData := c_releaseData_s3.asTypeOf(new DSBlock) + + // TODO: add nested writeback from Snoop + + /* ======== prefetch ======== */ + io.prefetchTrain.foreach { + train => + // train on request(with needHint flag) miss or hit on prefetched block + // trigger train also in a_merge here + train.valid := task_s3.valid && ((req_acquire_s3 || req_get_s3) && req_s3.needHint.getOrElse(false.B) && + (!dirResult_s3.hit || meta_s3.prefetch.get) || req_s3.mergeA) + train.bits.tag := req_s3.tag + train.bits.set := req_s3.set + train.bits.needT := Mux( + req_s3.mergeA, + needT(req_s3.aMergeTask.opcode, req_s3.aMergeTask.param), + req_needT_s3 + ) + train.bits.source := Mux(req_s3.mergeA, req_s3.aMergeTask.sourceId, req_s3.sourceId) + train.bits.vaddr.foreach(_ := Mux(req_s3.mergeA, req_s3.aMergeTask.vaddr.getOrElse(0.U), req_s3.vaddr.getOrElse(0.U))) + train.bits.hit := Mux(req_s3.mergeA, true.B, dirResult_s3.hit) + train.bits.prefetched := Mux(req_s3.mergeA, true.B, meta_s3.prefetch.getOrElse(false.B)) + train.bits.pfsource := meta_s3.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) // TODO + train.bits.reqsource := req_s3.reqSource + } + + /* ======== Stage 4 ======== */ + val task_s4 = RegInit(0.U.asTypeOf(Valid(new TaskBundle()))) + val data_unready_s4 = RegInit(false.B) + val data_s4 = Reg(UInt((blockBytes * 8).W)) + val ren_s4 = RegInit(false.B) + val need_write_releaseBuf_s4 = RegInit(false.B) + val isD_s4, isTXREQ_s4, isTXRSP_s4, isTXDAT_s4 = RegInit(false.B) + val pendingTXDAT_s4 = task_s4.bits.fromB && !task_s4.bits.mshrTask && task_s4.bits.toTXDAT + val pendingD_s4 = task_s4.bits.fromA && !task_s4.bits.mshrTask && ( + task_s4.bits.opcode === GrantData || task_s4.bits.opcode === AccessAckData + ) + + task_s4.valid := task_s3.valid && !req_drop_s3 + + when (task_s3.valid && !req_drop_s3) { + task_s4.bits := source_req_s3 + + when (!task_s3.bits.mshrTask && need_mshr_s3) { + task_s4.bits.mshrId := io.fromMSHRCtl.mshr_alloc_ptr + } + + data_unready_s4 := data_unready_s3 + data_s4 := data_s3 + ren_s4 := ren + need_write_releaseBuf_s4 := need_write_releaseBuf + isD_s4 := isD_s3 + isTXREQ_s4 := isTXREQ_s3 + isTXRSP_s4 := isTXRSP_s3 + isTXDAT_s4 := isTXDAT_s3 + } + + // for reqs that CANNOT give response in MainPipe, but needs to write releaseBuf/refillBuf + // we cannot drop them at s3, we must let them go to s4/s5 + val chnl_fire_s4 = d_s4.fire || txreq_s4.fire || txrsp_s4.fire || txdat_s4.fire + val req_drop_s4 = !need_write_releaseBuf_s4 && chnl_fire_s4 + + val chnl_valid_s4 = task_s4.valid && !RegNext(chnl_fire_s3, false.B) + d_s4.valid := chnl_valid_s4 && isD_s4 + txreq_s4.valid := chnl_valid_s4 && isTXREQ_s4 + txrsp_s4.valid := chnl_valid_s4 && isTXRSP_s4 + txdat_s4.valid := chnl_valid_s4 && isTXDAT_s4 + d_s4.bits.task := task_s4.bits + d_s4.bits.data.data := data_s4 + txreq_s4.bits := task_s4.bits.toCHIREQBundle() + txrsp_s4.bits := task_s4.bits + txdat_s4.bits.task := task_s4.bits + txdat_s4.bits.data.data := data_s4 + + /* ======== Stage 5 ======== */ + val task_s5 = RegInit(0.U.asTypeOf(Valid(new TaskBundle()))) + val ren_s5 = RegInit(false.B) + val data_s5 = 
Reg(UInt((blockBytes * 8).W)) + val need_write_releaseBuf_s5 = RegInit(false.B) + val isD_s5, isTXREQ_s5, isTXRSP_s5, isTXDAT_s5 = RegInit(false.B) + + + task_s5.valid := task_s4.valid && !req_drop_s4 + + when (task_s4.valid && !req_drop_s4) { + task_s5.bits := task_s4.bits + ren_s5 := ren_s4 + data_s5 := data_s4 + need_write_releaseBuf_s5 := need_write_releaseBuf_s4 + isD_s5 := isD_s4 || pendingD_s4 + isTXREQ_s5 := isTXREQ_s4 + isTXRSP_s5 := isTXRSP_s4 + isTXDAT_s5 := isTXDAT_s4 || pendingTXDAT_s4 + } + val rdata_s5 = io.toDS.rdata_s5.data + val out_data_s5 = Mux(task_s5.bits.mshrTask || task_s5.bits.snpHitReleaseWithData, data_s5, rdata_s5) + val chnl_fire_s5 = d_s5.fire || txreq_s5.fire || txrsp_s5.fire || txdat_s5.fire + + // TODO: check this + val customL1Hint = Module(new CustomL1Hint) + + customL1Hint.io.s1 := io.taskInfo_s1 + // customL1Hint.io.s2 := task_s2 + + customL1Hint.io.s3.task := task_s3 + // overwrite opcode: if sinkReq can respond, use sink_resp_s3.bits.opcode = Grant/GrantData + customL1Hint.io.s3.task.bits.opcode := Mux(sink_resp_s3.valid, sink_resp_s3.bits.opcode, task_s3.bits.opcode) + // customL1Hint.io.s3.d := d_s3.valid + customL1Hint.io.s3.need_mshr := need_mshr_s3 + + // customL1Hint.io.s4.task := task_s4 + // customL1Hint.io.s4.d := d_s4.valid + // customL1Hint.io.s4.need_write_releaseBuf := need_write_releaseBuf_s4 + + // customL1Hint.io.s5.task := task_s5 + // customL1Hint.io.s5.d := d_s5.valid + + // customL1Hint.io.globalCounter := io.globalCounter + // customL1Hint.io.grantBufferHint <> io.grantBufferHint + + customL1Hint.io.l1Hint <> io.l1Hint + + io.releaseBufWrite.valid := task_s5.valid && need_write_releaseBuf_s5 + io.releaseBufWrite.bits.id := task_s5.bits.mshrId + io.releaseBufWrite.bits.data.data := rdata_s5 + io.releaseBufWrite.bits.beatMask := Fill(beatSize, true.B) + + val chnl_valid_s5 = task_s5.valid && !RegNext(chnl_fire_s4, false.B) && !RegNextN(chnl_fire_s3, 2, Some(false.B)) + d_s5.valid := chnl_valid_s5 && isD_s5 + txreq_s5.valid := chnl_valid_s5 && isTXREQ_s5 + txrsp_s5.valid := chnl_valid_s5 && isTXRSP_s5 + txdat_s5.valid := chnl_valid_s5 && isTXDAT_s5 + d_s5.bits.task := task_s5.bits + d_s5.bits.data.data := out_data_s5 + txreq_s5.bits := task_s5.bits.toCHIREQBundle() + txrsp_s5.bits := task_s5.bits + txdat_s5.bits.task := task_s5.bits + txdat_s5.bits.data.data := out_data_s5 + + /* ======== BlockInfo ======== */ + // if s2/s3 might write Dir, we must block s1 sink entrance + // TODO:[Check] it seems that s3 Dir write will naturally block all s1 by dirRead.ready + // (an even stronger blocking than set blocking) + // so we might not need s3 blocking here + def s23Block(chn: Char, s: TaskBundle): Bool = { + val s1 = io.fromReqArb.status_s1 + val s1_set = chn match { + case 'a' => s1.a_set + case 'b' => s1.b_set + case 'c' => s1.c_set + case 'g' => s1.g_set + } + s.set === s1_set && !(s.mshrTask && !s.metaWen) // if guaranteed not to write meta, no blocking needed + } + def bBlock(s: TaskBundle, tag: Boolean = false): Bool = { + val s1 = io.fromReqArb.status_s1 + // tag true: compare tag + set + s.set === s1.b_set && (if(tag) s.tag === s1.b_tag else true.B) + } + + io.toReqBuf(0) := task_s2.valid && s23Block('a', task_s2.bits) + io.toReqBuf(1) := task_s3.valid && s23Block('a', task_s3.bits) + + io.toReqArb.blockC_s1 := task_s2.valid && s23Block('c', task_s2.bits) + + io.toReqArb.blockB_s1 := + task_s2.valid && bBlock(task_s2.bits) || + task_s3.valid && bBlock(task_s3.bits) || + task_s4.valid && bBlock(task_s4.bits, tag = true) || + 
task_s5.valid && bBlock(task_s5.bits, tag = true) + + io.toReqArb.blockA_s1 := io.toReqBuf(0) || io.toReqBuf(1) + + io.toReqArb.blockG_s1 := task_s2.valid && s23Block('g', task_s2.bits) + + /* ======== Pipeline Status ======== */ + require(io.status_vec_toD.size == 3) + io.status_vec_toD(0).valid := task_s3.valid && Mux( + mshr_req_s3, + mshr_refill_s3 && !retry, + true.B + // TODO: + // To consider grantBuffer capacity conflict, only " req_s3.fromC || req_s3.fromA && !need_mshr_s3 " is needed + // But to consider mshrFull, all channel_reqs are needed + // so maybe it is excessive for grantBuf capacity conflict + ) + + io.status_vec_toD(0).bits.channel := task_s3.bits.channel + io.status_vec_toD(1).valid := task_s4.valid && (isD_s4 || pendingD_s4) + io.status_vec_toD(1).bits.channel := task_s4.bits.channel + io.status_vec_toD(2).valid := d_s5.valid + io.status_vec_toD(2).bits.channel := task_s5.bits.channel + + // capacity control of TX channels + val tx_task_s3 = Wire(Valid(new TaskBundle)) + tx_task_s3.valid := task_s3.valid // TODO: review this + tx_task_s3.bits := source_req_s3 + val tasks = Seq(tx_task_s3, task_s4, task_s5) + io.status_vec_toTX.zip(tasks).foreach { case (status, task) => + status.valid := task.valid + status.bits.channel := task.bits.channel + // To optimize timing, we restrict the blocking condition of TXRSP and TXDAT. + // This may be inaccurate, but it works. + status.bits.txChannel := Cat( + // TXDAT + !neverRespData, + // TXRSP + !doRespDataHitRelease, + // TXREQ + task.bits.toTXREQ + ) + status.bits.mshrTask := task.bits.mshrTask + } + + /* ======== Other Signals Assignment ======== */ + // Initial state assignment + // ! Caution: s_ and w_ are false-as-valid + when (req_s3.fromA) { + alloc_state.s_refill := false.B + alloc_state.w_replResp := dirResult_s3.hit + // need Acquire downwards + when (need_acquire_s3_a) { + alloc_state.s_acquire := false.B + alloc_state.s_compack.get := false.B + alloc_state.w_grantfirst := false.B + alloc_state.w_grantlast := false.B + alloc_state.w_grant := false.B + } + // need Probe for alias + // need Probe when Get hits on a TRUNK block + when (cache_alias || need_probe_s3_a) { + alloc_state.s_rprobe := false.B + alloc_state.w_rprobeackfirst := false.B + alloc_state.w_rprobeacklast := false.B + } + } + + when (req_s3.fromB) { + alloc_state.s_probeack := false.B + // need pprobe + when (need_pprobe_s3_b) { + alloc_state.s_pprobe := false.B + alloc_state.w_pprobeackfirst := false.B + alloc_state.w_pprobeacklast := false.B + alloc_state.w_pprobeack := false.B + } + // need forwarding response + when (need_dct_s3_b) { + alloc_state.s_dct.get := false.B + } + } + + val d = Seq(d_s5, d_s4, d_s3) + val txreq = Seq(txreq_s5, txreq_s4, txreq_s3) + val txrsp = Seq(txrsp_s5, txrsp_s4, txrsp_s3) + val txdat = Seq(txdat_s5, txdat_s4, txdat_s3) + // DO NOT use TLArbiter because TLArbiter will send continuous beats for the same source + arb(d, io.toSourceD, Some("toSourceD")) + arb(txreq, io.toTXREQ, Some("toTXREQ")) + arb(txrsp, io.toTXRSP, Some("toTXRSP")) + arb(txdat, io.toTXDAT, Some("toTXDAT")) + + + /* ===== Performance counters ===== */ + // num of mshr req + XSPerfAccumulate(cacheParams, "mshr_grant_req", task_s3.valid && mshr_grant_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_grantdata_req", task_s3.valid && mshr_grantdata_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_accessackdata_req", task_s3.valid && mshr_accessackdata_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_hintack_req", task_s3.valid && mshr_hintack_s3 
&& !retry) + // XSPerfAccumulate(cacheParams, "mshr_probeack_req", task_s3.valid && mshr_probeack_s3) + // XSPerfAccumulate(cacheParams, "mshr_probeackdata_req", task_s3.valid && mshr_probeackdata_s3) + // XSPerfAccumulate(cacheParams, "mshr_release_req", task_s3.valid && mshr_release_s3) + XSPerfAccumulate(cacheParams, "mshr_snpResp_req", task_s3.valid && mshr_snpResp_s3) + XSPerfAccumulate(cacheParams, "mshr_snpRespFwded_req", task_s3.valid && mshr_snpRespFwded_s3) + XSPerfAccumulate(cacheParams, "mshr_snpRespData_req", task_s3.valid && mshr_snpRespData_s3) + XSPerfAccumulate(cacheParams, "mshr_snpRespDataPtl_req", task_s3.valid && mshr_snpRespDataPtl_s3) + XSPerfAccumulate(cacheParams, "mshr_snpRespDataFwded_req", task_s3.valid && mshr_snpRespDataFwded_s3) + XSPerfAccumulate(cacheParams, "mshr_writeBackFull", task_s3.valid && mshr_writeBackFull_s3) + XSPerfAccumulate(cacheParams, "mshr_evict_s3", task_s3.valid && mshr_evict_s3) + + + // directory access result + val hit_s3 = task_s3.valid && !mshr_req_s3 && dirResult_s3.hit + val miss_s3 = task_s3.valid && !mshr_req_s3 && !dirResult_s3.hit + XSPerfAccumulate(cacheParams, "a_req_hit", hit_s3 && req_s3.fromA) + XSPerfAccumulate(cacheParams, "acquire_hit", hit_s3 && req_s3.fromA && + (req_s3.opcode === AcquireBlock || req_s3.opcode === AcquirePerm)) + XSPerfAccumulate(cacheParams, "get_hit", hit_s3 && req_s3.fromA && req_s3.opcode === Get) + XSPerfAccumulate(cacheParams, "retry", mshr_refill_s3 && retry) + + XSPerfAccumulate(cacheParams, "a_req_miss", miss_s3 && req_s3.fromA) + XSPerfAccumulate(cacheParams, "acquire_miss", miss_s3 && req_s3.fromA && + (req_s3.opcode === AcquireBlock || req_s3.opcode === AcquirePerm)) + XSPerfAccumulate(cacheParams, "get_miss", miss_s3 && req_s3.fromA && req_s3.opcode === Get) + + XSPerfAccumulate(cacheParams, "b_req_hit", hit_s3 && req_s3.fromB) + XSPerfAccumulate(cacheParams, "b_req_miss", miss_s3 && req_s3.fromB) + + XSPerfHistogram(cacheParams, "a_req_access_way", perfCnt = dirResult_s3.way, + enable = task_s3.valid && !mshr_req_s3 && req_s3.fromA, start = 0, stop = cacheParams.ways, step = 1) + XSPerfHistogram(cacheParams, "a_req_hit_way", perfCnt = dirResult_s3.way, + enable = hit_s3 && req_s3.fromA, start = 0, stop = cacheParams.ways, step = 1) + XSPerfHistogram(cacheParams, "a_req_miss_way_choice", perfCnt = dirResult_s3.way, + enable = miss_s3 && req_s3.fromA, start = 0, stop = cacheParams.ways, step = 1) + + // pipeline stages for TX and sourceD reqs + val pipe_len = Seq(5.U, 4.U, 3.U) + val sourceD_pipe_len = ParallelMux(d.map(_.fire), pipe_len) + val txreq_pipe_len = ParallelMux(txreq.map(_.fire), pipe_len) + val txrsp_pipe_len = ParallelMux(txrsp.map(_.fire), pipe_len) + val txdat_pipe_len = ParallelMux(txdat.map(_.fire), pipe_len) + XSPerfHistogram(cacheParams, "sourceD_pipeline_stages", sourceD_pipe_len, + enable = io.toSourceD.fire, start = 3, stop = 5+1, step = 1) + XSPerfHistogram(cacheParams, "txreq_pipeline_stages", txreq_pipe_len, + enable = io.toTXREQ.fire, start = 3, stop = 5+1, step = 1) + XSPerfHistogram(cacheParams, "txrsp_pipeline_stages", txrsp_pipe_len, + enable = io.toTXRSP.fire, start = 3, stop = 5+1, step = 1) + XSPerfHistogram(cacheParams, "txdat_pipeline_stages", txdat_pipe_len, + enable = io.toTXDAT.fire, start = 3, stop = 5+1, step = 1) + 
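For intuition (editor's sketch, not part of the patch): the d/txreq/txrsp/txdat arbiters above take their inputs oldest-stage-first (s5, s4, s3), so, assuming the arb helper behaves like a fixed-priority chisel3.util.Arbiter where the lowest index wins, an older in-flight task always drains before a younger one; the *_pipeline_stages histograms then record which stage each fired request left from. A plain-Scala model of that priority pick (names are illustrative):

object StagePrioritySketch extends App {
  // inputs listed oldest-first, as (stageName, valid)
  def pick(stages: Seq[(String, Boolean)]): Option[String] =
    stages.collectFirst { case (name, true) => name }
  // with s5 and s3 both requesting, the older s5 wins
  assert(pick(Seq("s5" -> true, "s4" -> false, "s3" -> true)).contains("s5"))
  // with only s3 requesting, s3 goes out (a 3-stage path in the histogram)
  assert(pick(Seq("s5" -> false, "s4" -> false, "s3" -> true)).contains("s3"))
}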
+ prefetchOpt.foreach { + _ => + XSPerfAccumulate(cacheParams, "a_req_trigger_prefetch", io.prefetchTrain.get.fire) + XSPerfAccumulate(cacheParams, "a_req_trigger_prefetch_not_ready", io.prefetchTrain.get.valid && !io.prefetchTrain.get.ready) + XSPerfAccumulate(cacheParams, "acquire_trigger_prefetch_on_miss", io.prefetchTrain.get.fire && req_acquire_s3 && !dirResult_s3.hit) + XSPerfAccumulate(cacheParams, "acquire_trigger_prefetch_on_hit_pft", io.prefetchTrain.get.fire && req_acquire_s3 && dirResult_s3.hit && meta_s3.prefetch.get) + // TODO + // XSPerfAccumulate(cacheParams, "release_all", mshr_release_s3) + // XSPerfAccumulate(cacheParams, "release_prefetch_accessed", mshr_release_s3 && meta_s3.prefetch.get && meta_s3.accessed) + // XSPerfAccumulate(cacheParams, "release_prefetch_not_accessed", mshr_release_s3 && meta_s3.prefetch.get && !meta_s3.accessed) + XSPerfAccumulate(cacheParams, "get_trigger_prefetch_on_miss", io.prefetchTrain.get.fire && req_get_s3 && !dirResult_s3.hit) + XSPerfAccumulate(cacheParams, "get_trigger_prefetch_on_hit_pft", io.prefetchTrain.get.fire && req_get_s3 && dirResult_s3.hit && meta_s3.prefetch.get) + } + + XSPerfAccumulate(cacheParams, "early_prefetch", meta_s3.prefetch.getOrElse(false.B) && !meta_s3.accessed && !dirResult_s3.hit && task_s3.valid) + +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/RXDAT.scala b/src/main/scala/coupledL2/tl2chi/RXDAT.scala new file mode 100644 index 00000000..d41a1e3b --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/RXDAT.scala @@ -0,0 +1,67 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.{RespBundle, MSHRBufWrite} + +class RXDAT(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val out = Flipped(DecoupledIO(new CHIDAT())) + val in = Output(new RespBundle()) + val refillBufWrite = ValidIO(new MSHRBufWrite()) + }) + + /* RXDAT for Transactions: CompData */ + + // TODO: parameterize this + // for a 256-bit data bus + val first = (io.out.bits.dataID === "b00".U) + val last = (io.out.bits.dataID === "b10".U) + + /* Write Refill Buffer */ + io.refillBufWrite.valid := io.out.valid + io.refillBufWrite.bits.id := io.out.bits.txnID + io.refillBufWrite.bits.data.data := Fill(beatSize, io.out.bits.data) + io.refillBufWrite.bits.beatMask := Cat(last, first) + + /* Response to MSHR */ + io.in.valid := (first || last) && io.out.valid + io.in.mshrId := io.out.bits.txnID + io.in.set := 0.U(setBits.W) + io.in.tag := 0.U(tagBits.W) + + io.in.respInfo.opcode := DontCare + io.in.respInfo.param := DontCare + io.in.respInfo.last := last + io.in.respInfo.dirty := DontCare + io.in.respInfo.isHit := DontCare + io.in.respInfo.chiOpcode.get := io.out.bits.opcode + io.in.respInfo.txnID.get := io.out.bits.txnID + io.in.respInfo.srcID.get := io.out.bits.srcID + io.in.respInfo.homeNID.get := io.out.bits.homeNID + io.in.respInfo.dbID.get := io.out.bits.dbID + io.in.respInfo.resp.get := io.out.bits.resp + io.in.respInfo.pCrdType.get := DontCare // RXDAT Channel does not have a pCrdType field + + io.out.ready := true.B + +} diff --git a/src/main/scala/coupledL2/tl2chi/RXRSP.scala b/src/main/scala/coupledL2/tl2chi/RXRSP.scala new file mode 100644 index 00000000..16ba842b --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/RXRSP.scala @@ -0,0 +1,53 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.RespBundle + +class RXRSP(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val out = Flipped(DecoupledIO(new CHIRSP())) + val in = Output(new RespBundle()) + }) + + /* RXRSP for Transactions: + 1. Comp + 2. CompDBIDResp + 3. RetryAck + 4.
PCrdGrant + */ + io.in.valid := io.out.valid + io.in.mshrId := io.out.bits.txnID + io.in.set := 0.U(setBits.W) + io.in.tag := 0.U(tagBits.W) + + io.in.respInfo := 0.U.asTypeOf(io.in.respInfo.cloneType) + io.in.respInfo.chiOpcode.get := io.out.bits.opcode + io.in.respInfo.txnID.get := io.out.bits.txnID + io.in.respInfo.srcID.get := io.out.bits.srcID + io.in.respInfo.dbID.get := io.out.bits.dbID + io.in.respInfo.pCrdType.get := io.out.bits.pCrdType + io.in.respInfo.last := true.B + + io.out.ready := true.B + +} diff --git a/src/main/scala/coupledL2/tl2chi/RXSNP.scala b/src/main/scala/coupledL2/tl2chi/RXSNP.scala new file mode 100644 index 00000000..8d64e128 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/RXSNP.scala @@ -0,0 +1,147 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.{TaskBundle, MSHRInfo, MetaEntry, MergeTaskBundle} +import coupledL2.MetaData._ + +class RXSNP( + lCreditNum: Int = 4 // the number of L-Credits that a receiver can provide +)(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val rxsnp = Flipped(DecoupledIO(new CHISNP())) + val task = DecoupledIO(new TaskBundle()) + val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo()))) + }) + + val task = Wire(new TaskBundle) + + /** + * When should an MSHR with Acquire address of X block/nest an incoming snoop with address X? + * + * 1. Before MSHR receives the first beat of CompData, snoop should be **nested** because snoop has higher priority + * than request according to CHI spec. + * 2. After MSHR receives the first beat of CompData, and before L2 receives GrantAck from L1, snoop of X should be + * **blocked**, because a slave should not issue a Probe if there is a pending GrantAck on the block according + * to TileLink spec. + * 3. Before MSHR sends out WriteBackFull/Evict to write refilled data into DS, snoop should be **blocked**, because + * the snooped block is still in RefillBuffer rather than DS. + * 4. After MSHR sends out WriteBackFull/Evict and writes refilled data into DS, snoop should be **nested**, still + * because snoop has higher priority than request. + */ + val reqBlockSnpMask = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag && + (s.bits.w_grantfirst || s.bits.aliasTask.getOrElse(false.B) && !s.bits.w_rprobeacklast) && + (s.bits.blockRefill || s.bits.w_releaseack) && !s.bits.willFree + )).asUInt + val reqBlockSnp = reqBlockSnpMask.orR + + /** + * When should an MSHR that is going to replace cacheline Y block/nest an incoming snoop with address Y? + * + * 1.
After MSHR decides which way to replace but before MSHR has finished all the rProbes, the incoming snoop of Y + * should be **blocked**, because once the Probe is issued, the slave should not issue further Probes on the block + * until it receives a ProbeAck. + * 2. After MSHR receives all the ProbeAcks of rProbe, the snoop of Y should be nested. + */ + val replaceBlockSnpMask = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && !s.bits.dirHit && isValid(s.bits.metaState) && + s.bits.w_replResp && (!s.bits.w_rprobeacklast || s.bits.w_releaseack) && !s.bits.willFree + )).asUInt + val replaceBlockSnp = replaceBlockSnpMask.orR + val replaceNestSnpMask = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && !s.bits.dirHit && s.bits.metaState =/= INVALID && + s.bits.w_replResp && s.bits.w_rprobeacklast && !s.bits.w_releaseack + )).asUInt + val replaceDataMask = VecInit(io.msInfo.map(_.bits.replaceData)).asUInt + + task := fromSnpToTaskBundle(io.rxsnp.bits) + + val stall = reqBlockSnp || replaceBlockSnp // addrConflict || replaceConflict + io.task.valid := io.rxsnp.valid && !stall + io.task.bits := task + io.rxsnp.ready := io.task.ready && !stall + + val stallCnt = RegInit(0.U(64.W)) + when(io.rxsnp.fire) { + stallCnt := 0.U + }.elsewhen(io.rxsnp.valid && !io.rxsnp.ready) { + stallCnt := stallCnt + 1.U + } + + val STALL_CNT_MAX = 28000.U + assert(stallCnt <= STALL_CNT_MAX, "stallCnt full! maybe there is a deadlock! addr => 0x%x req_opcode => %d txn_id => %d", io.rxsnp.bits.addr, io.rxsnp.bits.opcode, io.rxsnp.bits.txnID); + + assert(!(stall && io.rxsnp.fire)) + + def fromSnpToTaskBundle(snp: CHISNP): TaskBundle = { + val task = WireInit(0.U.asTypeOf(new TaskBundle)) + task.channel := "b010".U + // Addr in CHI SNP channel has 3 fewer bits than the full address + val snpFullAddr = Cat(snp.addr, 0.U(3.W)) + task.tag := parseAddress(snpFullAddr)._1 + task.set := parseAddress(snpFullAddr)._2 + task.off := parseAddress(snpFullAddr)._3 + task.alias.foreach(_ := 0.U) + task.vaddr.foreach(_ := 0.U) + task.isKeyword.foreach(_ := false.B) + // task.opcode := snp.opcode + task.param := 0.U + task.size := log2Up(cacheParams.blockBytes).U + task.sourceId := 0.U(sourceIdBits.W) + task.bufIdx := 0.U(bufIdxBits.W) + task.needProbeAckData := false.B + task.mshrTask := false.B + task.mshrId := 0.U(mshrBits.W) + task.aliasTask.foreach(_ := false.B) + task.useProbeData := false.B + task.mshrRetry := false.B + task.fromL2pft.foreach(_ := false.B) + task.needHint.foreach(_ := false.B) + task.dirty := false.B + task.way := 0.U(wayBits.W) + task.meta := 0.U.asTypeOf(new MetaEntry) + task.metaWen := false.B + task.tagWen := false.B + task.dsWen := false.B + task.wayMask := Fill(cacheParams.ways, "b1".U) + task.reqSource := MemReqSource.NoWhere.id.U + task.replTask := false.B + task.mergeA := false.B + task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + task.snpHitRelease := replaceNestSnpMask.orR + task.snpHitReleaseWithData := (replaceNestSnpMask & replaceDataMask).orR + task.snpHitReleaseIdx := PriorityEncoder(replaceNestSnpMask) + task.tgtID.foreach(_ := 0.U) // TODO + task.srcID.foreach(_ := snp.srcID) + task.txnID.foreach(_ := snp.txnID) + task.dbID.foreach(_ := 0.U) + task.fwdNID.foreach(_ := snp.fwdNID) + task.fwdTxnID.foreach(_ := snp.fwdTxnID) + task.chiOpcode.foreach(_ := snp.opcode) + task.pCrdType.foreach(_ := 0.U) + task.retToSrc.foreach(_ := snp.retToSrc) + task + } + +} diff --git
a/src/main/scala/coupledL2/tl2chi/Slice.scala b/src/main/scala/coupledL2/tl2chi/Slice.scala new file mode 100644 index 00000000..a5c54cd4 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/Slice.scala @@ -0,0 +1,207 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import freechips.rocketchip.tilelink._ +import org.chipsalliance.cde.config.Parameters +import coupledL2._ +import coupledL2.prefetch.PrefetchIO + +class Slice()(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val in = Flipped(TLBundle(edgeIn.bundle)) + val out = new DecoupledPortIO + val sliceId = Input(UInt(bankBits.W)) + val l1Hint = Decoupled(new L2ToL1Hint()) + val waitPCrdInfo = Output(Vec(mshrsAll, new PCrdInfo)) + val prefetch = prefetchOpt.map(_ => Flipped(new PrefetchIO)) + val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus))) + val dirResult = topDownOpt.map(_ => ValidIO(new DirResult)) + val latePF = topDownOpt.map(_ => Output(Bool())) + }) + + /* Upwards TileLink-related modules */ + val sinkA = Module(new SinkA) + val sinkC = Module(new SinkC) + val grantBuf = Module(new GrantBuffer) + + /* Downwards CHI-related modules */ + val txreq = Module(new TXREQ()) + val txdat = Module(new TXDAT()) + val txrsp = Module(new TXRSP()) + val rxsnp = Module(new RXSNP()) + val rxdat = Module(new RXDAT()) + val rxrsp = Module(new RXRSP()) + + /* Data path and control path */ + val directory = Module(new Directory()) + val dataStorage = Module(new DataStorage()) + val refillBuf = Module(new MSHRBuffer(wPorts = 2)) + val releaseBuf = Module(new MSHRBuffer(wPorts = 3)) + + val reqArb = Module(new RequestArb()) + val mainPipe = Module(new MainPipe()) + val reqBuf = Module(new RequestBuffer()) + val mshrCtl = Module(new MSHRCtl()) + + sinkC.io.msInfo := mshrCtl.io.msInfo + + grantBuf.io.d_task <> mainPipe.io.toSourceD + grantBuf.io.fromReqArb.status_s1 := reqArb.io.status_s1 + grantBuf.io.pipeStatusVec := reqArb.io.status_vec ++ mainPipe.io.status_vec_toD + + val status_vec_toTX = reqArb.io.status_vec_toTX.get ++ mainPipe.io.status_vec_toTX + txreq.io.pipeReq <> mainPipe.io.toTXREQ + txreq.io.mshrReq <> mshrCtl.io.toTXREQ + txreq.io.pipeStatusVec := status_vec_toTX + txreq.io.sliceId := io.sliceId + + txdat.io.in <> mainPipe.io.toTXDAT + txdat.io.pipeStatusVec := status_vec_toTX + + txrsp.io.pipeRsp <> mainPipe.io.toTXRSP + txrsp.io.mshrRsp <> mshrCtl.io.toTXRSP + txrsp.io.pipeStatusVec := status_vec_toTX + + rxsnp.io.msInfo := mshrCtl.io.msInfo + + directory.io.read <> reqArb.io.dirRead_s1 + directory.io.metaWReq := mainPipe.io.metaWReq + directory.io.tagWReq := mainPipe.io.tagWReq + directory.io.msInfo := mshrCtl.io.msInfo + + dataStorage.io.req := 
mainPipe.io.toDS.req_s3 + dataStorage.io.wdata := mainPipe.io.toDS.wdata_s3 + + reqArb.io.ATag := reqBuf.io.ATag + reqArb.io.ASet := reqBuf.io.ASet + reqArb.io.sinkA <> reqBuf.io.out + reqArb.io.sinkB <> rxsnp.io.task + reqArb.io.sinkC <> sinkC.io.task + reqArb.io.mshrTask <> mshrCtl.io.mshrTask + reqArb.io.fromMSHRCtl := mshrCtl.io.toReqArb + reqArb.io.fromMainPipe := mainPipe.io.toReqArb + reqArb.io.fromGrantBuffer := grantBuf.io.toReqArb + reqArb.io.fromTXDAT.foreach(_ := txdat.io.toReqArb) + reqArb.io.fromTXRSP.foreach(_ := txrsp.io.toReqArb) + reqArb.io.fromTXREQ.foreach(_ := txreq.io.toReqArb) + reqArb.io.msInfo := mshrCtl.io.msInfo + + reqBuf.io.in <> sinkA.io.task + reqBuf.io.mshrInfo := mshrCtl.io.msInfo + reqBuf.io.mainPipeBlock := mainPipe.io.toReqBuf + reqBuf.io.s1Entrance := reqArb.io.s1Entrance + + mainPipe.io.taskFromArb_s2 := reqArb.io.taskToPipe_s2 + mainPipe.io.taskInfo_s1 := reqArb.io.taskInfo_s1 + mainPipe.io.fromReqArb.status_s1 := reqArb.io.status_s1 + mainPipe.io.bufResp := sinkC.io.bufResp + mainPipe.io.dirResp_s3 := directory.io.resp.bits + mainPipe.io.replResp := directory.io.replResp + mainPipe.io.fromMSHRCtl <> mshrCtl.io.toMainPipe + mainPipe.io.bufResp := sinkC.io.bufResp + mainPipe.io.refillBufResp_s3.valid := RegNext(refillBuf.io.r.valid, false.B) + mainPipe.io.refillBufResp_s3.bits := refillBuf.io.resp.data + mainPipe.io.releaseBufResp_s3.valid := RegNext(releaseBuf.io.r.valid, false.B) + mainPipe.io.releaseBufResp_s3.bits := releaseBuf.io.resp.data + mainPipe.io.toDS.rdata_s5 := dataStorage.io.rdata + // mainPipe.io.grantBufferHint := grantBuf.io.l1Hint + // mainPipe.io.globalCounter := grantBuf.io.globalCounter + + mshrCtl.io.fromReqArb.status_s1 := reqArb.io.status_s1 + mshrCtl.io.fromMainPipe <> mainPipe.io.toMSHRCtl + mshrCtl.io.fromMainPipe.mshr_alloc_s3 := mainPipe.io.toMSHRCtl.mshr_alloc_s3 + mshrCtl.io.grantStatus := grantBuf.io.grantStatus + mshrCtl.io.resps.sinkC := sinkC.io.resp + mshrCtl.io.resps.rxrsp := rxrsp.io.in + mshrCtl.io.resps.rxdat := rxdat.io.in + mshrCtl.io.nestedwb := mainPipe.io.nestedwb + mshrCtl.io.replResp := directory.io.replResp + mshrCtl.io.aMergeTask := reqBuf.io.aMergeTask + // TODO: This is ugly + mshrCtl.io.pipeStatusVec(0) := (reqArb.io.status_vec)(1) // s2 status + mshrCtl.io.pipeStatusVec(1) := mainPipe.io.status_vec_toD(0) // s3 status + + /* Read and write release buffer */ + releaseBuf.io.r := reqArb.io.releaseBufRead_s2 + val nestedWriteReleaseBuf, + sinkCWriteReleaseBuf, + mpWriteReleaseBuf = Wire(Valid(new MSHRBufWrite())) + nestedWriteReleaseBuf.valid := mshrCtl.io.nestedwbDataId.valid + nestedWriteReleaseBuf.bits.data := mainPipe.io.nestedwbData + nestedWriteReleaseBuf.bits.id := mshrCtl.io.nestedwbDataId.bits + nestedWriteReleaseBuf.bits.beatMask := Fill(beatSize, true.B) + sinkCWriteReleaseBuf match { case x => + x := sinkC.io.releaseBufWrite + x.bits.id := mshrCtl.io.releaseBufWriteId + } + mpWriteReleaseBuf := mainPipe.io.releaseBufWrite + releaseBuf.io.w <> VecInit(Seq( + nestedWriteReleaseBuf, + sinkCWriteReleaseBuf, + mpWriteReleaseBuf + )) + + /* Read and write refill buffer */ + refillBuf.io.r := reqArb.io.refillBufRead_s2 + refillBuf.io.w <> VecInit(Seq(rxdat.io.refillBufWrite, sinkC.io.refillBufWrite)) + + io.prefetch.foreach { p => + p.train <> mainPipe.io.prefetchTrain.get + sinkA.io.prefetchReq.get <> p.req + p.resp <> grantBuf.io.prefetchResp.get + p.tlb_req.req.ready := true.B + p.tlb_req.resp.valid := false.B + p.tlb_req.resp.bits := DontCare + p.recv_addr := 0.U.asTypeOf(p.recv_addr) + } + + 
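+ // Note: the prefetcher's TLB request port is stubbed off at slice level (req.ready tied high, resp.valid tied low); + // actual translation requests are arbitrated and driven through io.l2_tlb_req at the top level (TL2CHICoupledL2).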
/* to Slice Top for pCrd info.*/ + io.waitPCrdInfo <> mshrCtl.io.waitPCrdInfo + + /* IO Connection */ + io.l1Hint <> mainPipe.io.l1Hint + topDownOpt.foreach ( + _ => { + io.msStatus.get := mshrCtl.io.msStatus.get + io.dirResult.get.valid := directory.io.resp.valid && !directory.io.replResp.valid // exclude MSHR-Grant read-dir + io.dirResult.get.bits := directory.io.resp.bits + io.latePF.get := reqBuf.io.hasLatePF + } + ) + + /* Connect upwards channels */ + val inBuf = cacheParams.innerBuf + // val outBuf = tl2tlParams.outerBuf + sinkA.io.a <> inBuf.a(io.in.a) + io.in.b <> inBuf.b(mshrCtl.io.toSourceB) + sinkC.io.c <> inBuf.c(io.in.c) + io.in.d <> inBuf.d(grantBuf.io.d) + grantBuf.io.e <> inBuf.e(io.in.e) + + /* Connect downwards channels */ + io.out.tx.req <> txreq.io.out + io.out.tx.dat <> txdat.io.out + io.out.tx.rsp <> txrsp.io.out + rxsnp.io.rxsnp <> io.out.rx.snp + rxdat.io.out <> io.out.rx.dat + rxrsp.io.out <> io.out.rx.rsp + +} diff --git a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala new file mode 100644 index 00000000..ba07892d --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala @@ -0,0 +1,492 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility.{FastArbiter, Pipeline, ParallelPriorityMux, RegNextN} +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import freechips.rocketchip.util._ +import org.chipsalliance.cde.config.{Parameters, Field} +import scala.math.max +import coupledL2._ +import coupledL2.tl2chi.CHIOpcode.RSPOpcodes._ +import coupledL2.prefetch._ +import coupledL2.utils.XSPerfAccumulate + +abstract class TL2CHIL2Bundle(implicit val p: Parameters) extends Bundle + with HasCoupledL2Parameters + with HasCHIMsgParameters +abstract class TL2CHIL2Module(implicit val p: Parameters) extends Module + with HasCoupledL2Parameters + with HasCHIMsgParameters + +class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { + + /** + * Make diplomacy happy: + * To implement multi-bank L2, a BankBinder must be placed downstream of L2, + * therefore a TLAdapterNode is implemented here. 
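+ * Only the TileLink side goes through diplomacy; the downstream CHI side bypasses it + * and is exposed directly as the io.chi port of the module implementation.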
+ */ + val managerPortParams = (m: TLSlavePortParameters) => TLSlavePortParameters.v1( + m.managers.map { m => + m.v2copy( + regionType = if (m.regionType >= RegionType.UNCACHED) RegionType.CACHED else m.regionType, + supports = TLMasterToSlaveTransferSizes( + acquireB = xfer, + acquireT = if (m.supportsAcquireT) xfer else TransferSizes.none, + arithmetic = if (m.supportsAcquireT) atom else TransferSizes.none, + logical = if (m.supportsAcquireT) atom else TransferSizes.none, + get = access, + putFull = if (m.supportsAcquireT) access else TransferSizes.none, + putPartial = if (m.supportsAcquireT) access else TransferSizes.none, + hint = access + ), + fifoId = None + ) + }, + beatBytes = 32, + minLatency = 2, + responseFields = cacheParams.respField, + requestKeys = cacheParams.reqKey, + endSinkId = idsAll + ) + + val clientPortParams = (m: TLMasterPortParameters) => TLMasterPortParameters.v2( + Seq( + TLMasterParameters.v2( + name = cacheParams.name, + supports = TLSlaveToMasterTransferSizes( + probe = xfer + ), + sourceId = IdRange(0, idsAll) + ) + ), + channelBytes = cacheParams.channelBytes, + minLatency = 1, + echoFields = cacheParams.echoField, + requestFields = cacheParams.reqField, + responseKeys = cacheParams.respKey + ) + + val node = TLAdapterNode( + clientFn = clientPortParams, + managerFn = managerPortParams + ) + + val addressRange = AddressSet(0x00000000L, 0xfffffffffL).subtract(AddressSet(0x0L, 0x7fffffffL)) // TODO: parameterize this + val managerParameters = TLSlavePortParameters.v1( + managers = Seq(TLSlaveParameters.v1( + address = addressRange, + regionType = RegionType.CACHED, + supportsAcquireT = xfer, + supportsAcquireB = xfer, + supportsArithmetic = atom, + supportsLogical = atom, + supportsGet = access, + supportsPutFull = access, + supportsPutPartial = access, + supportsHint = access, + fifoId = None + )), + beatBytes = 32, + minLatency = 2, + responseFields = cacheParams.respField, + requestKeys = cacheParams.reqKey, + endSinkId = idsAll // TODO: Confirm this + ) + val managerNode = TLManagerNode(Seq(managerParameters)) + + val mmioBridge = LazyModule(new MMIOBridge) + val mmioNode = mmioBridge.mmioNode + + class CoupledL2Imp(wrapper: LazyModule) extends LazyModuleImp(wrapper) { + val banks = node.in.size + val bankBits = log2Ceil(banks) + val l2TlbParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } + + val io = IO(new Bundle { + val hartId = Input(UInt(hartIdLen.W)) + val l2_hint = ValidIO(new L2ToL1Hint()) + val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams) + val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) + val robHeadPaddr = Flipped(Valid(UInt(36.W))) + val l2MissMatch = Output(Bool()) + } + val chi = new PortIO + val nodeID = Input(UInt()) + }) + + // Display info + val sizeBytes = cacheParams.toCacheParams.capacity.toDouble + val sizeStr = sizeBytesToStr(sizeBytes) + val prefetch = "prefetch: " + cacheParams.prefetch + println(s"====== Inclusive TL-CHI ${cacheParams.name} ($sizeStr * $banks-bank) $prefetch ======") + println(s"bankBits: ${bankBits}") + println(s"replacement: ${cacheParams.replacement}") + println(s"sets:${cacheParams.sets} ways:${cacheParams.ways} blockBytes:${cacheParams.blockBytes}") + print_bundle_fields(node.in.head._2.bundle.requestFields, "usr") + print_bundle_fields(node.in.head._2.bundle.echoFields, "echo") + println(s"CHI REQ Width: ${(new CHIREQ).getWidth}") + println(s"CHI RSP Width: ${(new 
CHIRSP).getWidth}") + println(s"CHI SNP Width: ${(new CHISNP).getWidth}") + println(s"CHI DAT Width: ${(new CHIDAT).getWidth}") + println(s"CHI Port Width: ${io.chi.getWidth}") + + println(s"Cacheable:") + node.edges.in.headOption.foreach { n => + n.client.clients.zipWithIndex.foreach { + case (c, i) => + println(s"\t${i} <= ${c.name};" + + s"\tsourceRange: ${c.sourceId.start}~${c.sourceId.end}") + } + } + println(s"MMIO:") + mmioNode.edges.in.headOption.foreach { n => + n.client.clients.zipWithIndex.foreach { + case (c, i) => + println(s"\t${i} <= ${c.name};" + + s"\tsourceRange: ${c.sourceId.start}~${c.sourceId.end}") + } + } + + // Connection between prefetcher and the slices + val pftParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } + val prefetcher = prefetchOpt.map(_ => Module(new Prefetcher()(pftParams))) + val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams))))) + val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchResp()(pftParams))))) + val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) + io.l2_tlb_req <> DontCare // TODO: l2_tlb_req should be Option + prefetchOpt.foreach { + _ => + fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) + prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR + prefetcher.get.hartId := io.hartId + fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) + prefetcher.get.io.tlb_req <> io.l2_tlb_req + } + pf_recv_node match { + case Some(x) => + prefetcher.get.io.recv_addr.valid := x.in.head._1.addr_valid + prefetcher.get.io.recv_addr.bits.addr := x.in.head._1.addr + prefetcher.get.io.recv_addr.bits.pfSource := x.in.head._1.pf_source + prefetcher.get.io_l2_pf_en := x.in.head._1.l2_pf_en + case None => + prefetcher.foreach{ + p => + p.io.recv_addr := 0.U.asTypeOf(p.io.recv_addr) + p.io_l2_pf_en := false.B + } + } + + // TODO: Remove this to utility or HasCoupledL2Parameters + def bank_eq(set: UInt, bankId: Int, bankBits: Int): Bool = { + if(bankBits == 0) true.B else set(bankBits - 1, 0) === bankId.U + } + + // ** WARNING:TODO: this depends on where the latch is + // ** if Hint latched in slice, while D-Channel latched in XSTile + // ** we need only [hintCycleAhead - 1] later + val sliceAhead = hintCycleAhead - 1 + + val hintChosen = Wire(UInt(banks.W)) + val hintFire = Wire(Bool()) + + // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice + // then we releaseSourceD, enabling io.d.ready for other slices + // TODO: if Hint for single slice is 100% accurate, may consider remove this + val releaseSourceD = Wire(Vec(banks, Bool())) + val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR + + val slices = node.in.zipWithIndex.map { + case ((in, edgeIn), i) => + val rst_L2 = reset + val slice = withReset(rst_L2) { + Module(new Slice()(p.alterPartial { + case EdgeInKey => edgeIn + case BankBitsKey => bankBits + case SliceIdKey => i + })) + } + slice.io.in <> in + if (enableHintGuidedGrant) { + // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) + // we will try our best to select the grant of slice X. + // If slice X has no grant then, it means that the hint at cycle T is wrong, + // so we relax the restriction on grant selection. 
+ val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) || + RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1) + + releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid + + in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire) + slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire) + } + in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i) + slice.io.sliceId := i.U + + slice.io.prefetch.zip(prefetcher).foreach { + case (s, p) => + s.req.valid := p.io.req.valid && bank_eq(p.io.req.bits.set, i, bankBits) + s.req.bits := p.io.req.bits + prefetchReqsReady(i) := s.req.ready && bank_eq(p.io.req.bits.set, i, bankBits) + val train = Pipeline(s.train) + val resp = Pipeline(s.resp) + prefetchTrains.get(i) <> train + prefetchResps.get(i) <> resp + // restore to full address + if(bankBits != 0){ + val train_full_addr = Cat( + train.bits.tag, train.bits.set, i.U(bankBits.W), 0.U(offsetBits.W) + ) + val (train_tag, train_set, _) = s.parseFullAddress(train_full_addr) + val resp_full_addr = Cat( + resp.bits.tag, resp.bits.set, i.U(bankBits.W), 0.U(offsetBits.W) + ) + val (resp_tag, resp_set, _) = s.parseFullAddress(resp_full_addr) + prefetchTrains.get(i).bits.tag := train_tag + prefetchTrains.get(i).bits.set := train_set + prefetchResps.get(i).bits.tag := resp_tag + prefetchResps.get(i).bits.set := resp_set + } + s.tlb_req.req.valid := false.B + s.tlb_req.req.bits := DontCare + s.tlb_req.req_kill := DontCare + s.tlb_req.resp.ready := true.B + } + + slice + } + + if (enableHintGuidedGrant) { + // for timing consideration, hint should latch one cycle before sending to L1 + // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false" + val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size)) + val slices_l1Hint = slices.zipWithIndex.map { + case (s, i) => s.io.l1Hint + } + // should only Hint for DCache + val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients + .filter(_.supports.probe) + .map(c => { + (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) + }).head + + l1HintArb.io.in <> VecInit(slices_l1Hint) + io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache + io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart + io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword + // continuous hints can only be sent every two cycles, since GrantData takes two cycles + l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B) + + hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT ! 
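+ // Arbiter.io.chosen is a binary slice index (hence the warning above), which is why sliceCanFire + // compares it against i.U instead of testing a one-hot bit.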
+ hintFire := io.l2_hint.valid + } + + /** + * TxnID space arrangement: + * If this is a cacheable request: + * +----------------+-----------+---------------+ + * | 0.U(1.W) | SliceID | Inner TxnID | + * +----------------+-----------+---------------+ + * Otherwise this is an MMIO request: + * +----------------+-----------+---------------+ + * | 1.U(1.W) | Inner TxnID | + * +----------------+---------------------------+ + * + */ + def setSliceID(txnID: UInt, sliceID: UInt, mmio: Bool): UInt = { + Mux( + mmio, + Cat(1.U(1.W), txnID.tail(1)), + Cat(0.U(1.W), if (banks <= 1) txnID.tail(1) else Cat(sliceID(bankBits - 1, 0), txnID.tail(bankBits + 1))) + ) + } + def getSliceID(txnID: UInt): UInt = if (banks <= 1) 0.U else txnID.tail(1).head(bankBits) + def restoreTXNID(txnID: UInt): UInt = { + val mmio = txnID.head(1).asBool + Mux( + mmio || (banks <= 1).B, + Cat(0.U(1.W), txnID.tail(1)), + Cat(0.U(1.W), 0.U(bankBits.W), txnID.tail(bankBits + 1)) + ) + } + val mmio = mmioBridge.module + + // TXREQ + val txreq_arb = Module(new Arbiter(new CHIREQ, slices.size + 1)) // plus 1 for MMIO + val txreq = Wire(DecoupledIO(new CHIREQ)) + slices.zip(txreq_arb.io.in.init).foreach { case (s, in) => in <> s.io.out.tx.req } + txreq_arb.io.in.last <> mmio.io.tx.req + txreq <> txreq_arb.io.out + txreq.bits.txnID := setSliceID(txreq_arb.io.out.bits.txnID, txreq_arb.io.chosen, mmio.io.tx.req.fire) + + // TXRSP + val txrsp = Wire(DecoupledIO(new CHIRSP)) + arb(slices.map(_.io.out.tx.rsp), txrsp, Some("txrsp")) + + // TXDAT + val txdat = Wire(DecoupledIO(new CHIDAT)) + arb(slices.map(_.io.out.tx.dat) :+ mmio.io.tx.dat, txdat, Some("txdat")) + + // RXSNP + val rxsnp = Wire(DecoupledIO(new CHISNP)) + val rxsnpSliceID = if (banks <= 1) 0.U else (rxsnp.bits.addr >> (offsetBits - 3))(bankBits - 1, 0) + slices.zipWithIndex.foreach { case (s, i) => + s.io.out.rx.snp.valid := rxsnp.valid && rxsnpSliceID === i.U + s.io.out.rx.snp.bits := rxsnp.bits + } + rxsnp.ready := Cat(slices.zipWithIndex.map { case (s, i) => s.io.out.rx.snp.ready && rxsnpSliceID === i.U }).orR + + // RXRSP + val rxrsp = Wire(DecoupledIO(new CHIRSP)) + val rxrspIsMMIO = rxrsp.bits.txnID.head(1).asBool + val isPCrdGrant = rxrsp.valid && (rxrsp.bits.opcode === PCrdGrant) + val pArb = Module(new RRArbiter(UInt(), banks)) + /* + when PCrdGrant, give credit to one Slice that: + 1. got RetryAck and not Reissued + 2. match srcID and PCrdType + 3. 
use Round-Robin arbiter if multi-Slice match + */ + val matchPCrdGrant = Wire(Vec(banks, UInt(mshrsAll.W))) + slices.zipWithIndex.foreach { case (s, i) => + matchPCrdGrant(i) := VecInit(s.io.waitPCrdInfo.map(p => + p.valid && isPCrdGrant && + p.srcID.get === rxrsp.bits.srcID && + p.pCrdType.get === rxrsp.bits.pCrdType + )).asUInt + } + val pCrdIsWait = VecInit(matchPCrdGrant.map(_.asUInt.orR)).asUInt + + pArb.io.in.zipWithIndex.foreach { + case (in, i) => + in.valid := pCrdIsWait(i) + in.bits := 0.U + } + pArb.io.out.ready := true.B + val pCrdSliceID = pArb.io.chosen +// val pCrdSliceID = PriorityEncoder(pCrdIsWait) +// val rxrspSliceID = getSliceID(rxrsp.bits.txnID) + val rxrspSliceID = Mux(isPCrdGrant, pCrdSliceID, getSliceID(rxrsp.bits.txnID)) + slices.zipWithIndex.foreach { case (s, i) => + s.io.out.rx.rsp.valid := rxrsp.valid && rxrspSliceID === i.U && !rxrspIsMMIO + s.io.out.rx.rsp.bits := rxrsp.bits + s.io.out.rx.rsp.bits.txnID := restoreTXNID(rxrsp.bits.txnID) + } + mmio.io.rx.rsp.valid := rxrsp.valid && rxrspIsMMIO + mmio.io.rx.rsp.bits := rxrsp.bits + mmio.io.rx.rsp.bits.txnID := restoreTXNID(rxrsp.bits.txnID) + rxrsp.ready := Mux( + rxrspIsMMIO, + mmio.io.rx.rsp.ready, + Cat(slices.zipWithIndex.map { case (s, i) => s.io.out.rx.rsp.ready && rxrspSliceID === i.U }).orR + ) + + // RXDAT + val rxdat = Wire(DecoupledIO(new CHIDAT)) + val rxdatIsMMIO = rxdat.bits.txnID.head(1).asBool + val rxdatSliceID = getSliceID(rxdat.bits.txnID) + slices.zipWithIndex.foreach { case (s, i) => + s.io.out.rx.dat.valid := rxdat.valid && rxdatSliceID === i.U && !rxdatIsMMIO + s.io.out.rx.dat.bits := rxdat.bits + s.io.out.rx.dat.bits.txnID := restoreTXNID(rxdat.bits.txnID) + } + mmio.io.rx.dat.valid := rxdat.valid && rxdatIsMMIO + mmio.io.rx.dat.bits := rxdat.bits + mmio.io.rx.dat.bits.txnID := restoreTXNID(rxdat.bits.txnID) + rxdat.ready := Mux( + rxdatIsMMIO, + mmio.io.rx.dat.ready, + Cat(slices.zipWithIndex.map { case (s, i) => s.io.out.rx.dat.ready && rxdatSliceID === i.U}).orR + ) + + val linkMonitor = Module(new LinkMonitor) + linkMonitor.io.in.tx.req <> txreq + linkMonitor.io.in.tx.rsp <> txrsp + linkMonitor.io.in.tx.dat <> txdat + rxsnp <> linkMonitor.io.in.rx.snp + rxrsp <> linkMonitor.io.in.rx.rsp + rxdat <> linkMonitor.io.in.rx.dat + io.chi <> linkMonitor.io.out + linkMonitor.io.nodeID := io.nodeID + + // ==================== TopDown ==================== + val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { + case EdgeInKey => node.in.head._2 + case BankBitsKey => bankBits + }))) + topDown match { + case Some(t) => + t.io.msStatus.zip(slices).foreach { + case (in, s) => in := s.io.msStatus.get + } + t.io.dirResult.zip(slices).foreach { + case (res, s) => res := s.io.dirResult.get + } + t.io.latePF.zip(slices).foreach { + case (in, s) => in := s.io.latePF.get + } + t.io.debugTopDown <> io.debugTopDown + case None => io.debugTopDown.l2MissMatch := false.B + } + + // ==================== XSPerf Counters ==================== + val grant_data_fire = slices.map { slice => { + val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) + slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData + } + } + XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) + + val hint_source = io.l2_hint.bits.sourceId + + val grant_data_source = ParallelPriorityMux(slices.map { + s => (s.io.in.d.fire, s.io.in.d.bits.source) + }) + + val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) + hintPipe2.io.in.valid := io.l2_hint.valid + 
hintPipe2.io.in.bits := hint_source + hintPipe2.io.out.ready := true.B + + val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) + hintPipe1.io.in.valid := io.l2_hint.valid + hintPipe1.io.in.bits := hint_source + hintPipe1.io.out.ready := true.B + + val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) + + val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "ok2Hints", okHint) + } + + lazy val module = new CoupledL2Imp(this) +} diff --git a/src/main/scala/coupledL2/tl2chi/TXDAT.scala b/src/main/scala/coupledL2/tl2chi/TXDAT.scala new file mode 100644 index 00000000..8cdb3368 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/TXDAT.scala @@ -0,0 +1,132 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.{TaskWithData, TaskBundle} + +class TXDATBlockBundle(implicit p: Parameters) extends TXBlockBundle { + val blockSinkBReqEntrance = Bool() + + override def apply() = 0.U.asTypeOf(this) +} + +class TXDAT(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val in = Flipped(DecoupledIO(new TaskWithData())) + val out = DecoupledIO(new CHIDAT()) + + val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI))) + val toReqArb = Output(new TXDATBlockBundle) + }) + + assert(!io.in.valid || io.in.bits.task.toTXDAT, "txChannel is wrong for TXDAT") + assert(io.in.ready, "TXDAT should never be full") + require(chiOpt.isDefined) + require(beatBytes * 8 == DATA_WIDTH) + + // TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later + val queue = Module(new Queue(io.in.bits.cloneType, entries = mshrsAll, flow = true)) + queue.io.enq <> io.in + + // Back pressure logic from TXDAT + val queueCnt = queue.io.count + // TODO: this may be imprecise, review this later + val pipeStatus_s1_s5 = io.pipeStatusVec + val pipeStatus_s1_s2 = pipeStatus_s1_s5.take(2) + val pipeStatus_s2 = pipeStatus_s1_s2.tail + val pipeStatus_s3_s5 = pipeStatus_s1_s5.drop(2) + // inflightCnt equals the number of reqs on s2~s5 that may flow into TXDAT soon, plus queueCnt. + // The calculation of inflightCnt might be imprecise and lead to false-positive back pressure. 
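+ // An overcount here only causes conservative back pressure (entrances stall slightly longer); + // it never drops or reorders flits, so the imprecision is functionally safe.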
+ val inflightCnt = PopCount(Cat(pipeStatus_s3_s5.map(s => s.valid && s.bits.toTXDAT && (s.bits.fromB || s.bits.mshrTask)))) + + PopCount(Cat(pipeStatus_s2.map(s => s.valid && Mux(s.bits.mshrTask, s.bits.toTXDAT, s.bits.fromB)))) + + queueCnt + val noSpaceForSinkBReq = inflightCnt >= mshrsAll.U + val noSpaceForMSHRReq = inflightCnt >= (mshrsAll-1).U + + io.toReqArb.blockSinkBReqEntrance := noSpaceForSinkBReq + io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq + + val beatValids = RegInit(VecInit(Seq.fill(beatSize)(false.B))) + val taskValid = beatValids.asUInt.orR + val taskR = RegInit(0.U.asTypeOf(new TaskWithData)) + + val dequeueReady = !taskValid // TODO: this may introduce bubble? + queue.io.deq.ready := dequeueReady + when (queue.io.deq.fire) { + beatValids.foreach(_ := true.B) + taskR := queue.io.deq.bits + } + + val data = taskR.data.data + val beatsOH = beatValids.asUInt + val (beat, next_beatsOH) = getBeat(data, beatsOH) + + io.out.valid := taskValid + io.out.bits := toCHIDATBundle(taskR.task, beat, beatsOH) + + when (io.out.fire) { + beatValids := VecInit(next_beatsOH.asBools) + } + + def getBeat(data: UInt, beatsOH: UInt): (UInt, UInt) = { + // get one beat from data according to beatsOH + require(data.getWidth == (blockBytes * 8)) + require(beatsOH.getWidth == beatSize) + // next beat + val next_beat = ParallelPriorityMux(beatsOH, data.asTypeOf(Vec(beatSize, UInt((beatBytes * 8).W)))) + val selOH = PriorityEncoderOH(beatsOH) + // remaining beats that haven't been sent out + val next_beatsOH = beatsOH & ~selOH + (next_beat, next_beatsOH) + } + + def toCHIDATBundle(task: TaskBundle, beat: UInt, beatsOH: UInt): CHIDAT = { + val dat = WireInit(0.U.asTypeOf(new CHIDAT())) + + // width parameters and width check + require(beat.getWidth == dat.data.getWidth) + val beatOffsetWidth = log2Up(beatBytes) + val chunkOffsetWidth = log2Up(16) // DataID is assigned with the granularity of a 16-byte chunk + + dat.tgtID := task.tgtID.get + dat.srcID := task.srcID.get + dat.txnID := task.txnID.get + dat.homeNID := task.homeNID.get + dat.dbID := task.dbID.get + dat.opcode := task.chiOpcode.get + dat.ccID := 0.U // TODO: consider critical chunk id + // The DataID field value must be set to Addr[5:4] because the DataID field represents Addr[5:4] of the lowest + // addressed byte within the packet. + // dat.dataID := ParallelPriorityMux(beatsOH.asBools.zipWithIndex.map(x => (x._1, (x._2 << beatOffsetWidth).U(5, 4)))) + dat.dataID := ParallelPriorityMux( + beatsOH, + List.tabulate(beatSize)(i => (i << (beatOffsetWidth - chunkOffsetWidth)).U) + ) + dat.be := Fill(BE_WIDTH, 1.U(1.W)) + dat.data := beat + dat.resp := task.resp.get + dat.fwdState := task.fwdState.get + + dat + } + +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/TXREQ.scala b/src/main/scala/coupledL2/tl2chi/TXREQ.scala new file mode 100644 index 00000000..76fdc989 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/TXREQ.scala @@ -0,0 +1,76 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters + +class TXBlockBundle(implicit p: Parameters) extends TL2CHIL2Bundle { + // val blockSinkBReqEntrance = Bool() + val blockMSHRReqEntrance = Bool() + + def apply() = 0.U.asTypeOf(this) +} + +class TXREQ(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + val pipeReq = Flipped(DecoupledIO(new CHIREQ())) + val mshrReq = Flipped(DecoupledIO(new CHIREQ())) + val out = DecoupledIO(new CHIREQ()) + + val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI))) + val toReqArb = Output(new TXBlockBundle) + + val sliceId = Input(UInt(bankBits.W)) + }) + + assert(!io.pipeReq.valid || io.pipeReq.ready, "TXREQ should always be ready for pipeline req") + require(chiOpt.isDefined) + + // TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later + val queue = Module(new Queue(new CHIREQ, entries = mshrsAll, flow = true)) + + // Back pressure logic from TXREQ + val queueCnt = queue.io.count + // TODO: this may be imprecise, review this later + val pipeStatus_s1_s5 = io.pipeStatusVec + val pipeStatus_s2_s5 = pipeStatus_s1_s5.tail + val pipeStatus_s1 = pipeStatus_s1_s5.head + // inflightCnt equals the number of reqs on s2~s5 that may flow into TXREQ soon, plus queueCnt. + // The calculation of inflightCnt might be imprecise and lead to false-positive back pressure. + val inflightCnt = PopCount(Cat(pipeStatus_s2_s5.map(s => s.valid && s.bits.mshrTask && s.bits.toTXREQ))) + + pipeStatus_s1.valid.asUInt + + queueCnt + val noSpace = inflightCnt >= mshrsAll.U + + io.toReqArb.blockMSHRReqEntrance := noSpace + + queue.io.enq.valid := io.pipeReq.valid || io.mshrReq.valid && !noSpace + queue.io.enq.bits := Mux(io.pipeReq.valid, io.pipeReq.bits, io.mshrReq.bits) + + io.pipeReq.ready := true.B + io.mshrReq.ready := !io.pipeReq.valid && !noSpace + + // Decoupled2LCredit(queue.io.deq, io.out) + io.out <> queue.io.deq + io.out.bits.tgtID := SAM(sam).lookup(io.out.bits.addr) + io.out.bits.size := log2Ceil(blockBytes).U(SIZE_WIDTH.W) // TODO + io.out.bits.addr := restoreAddressUInt(queue.io.deq.bits.addr, io.sliceId) +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/TXRSP.scala b/src/main/scala/coupledL2/tl2chi/TXRSP.scala new file mode 100644 index 00000000..eaac0b98 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/TXRSP.scala @@ -0,0 +1,91 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.TaskBundle + +class TXRSPBlockBundle(implicit p: Parameters) extends TXBlockBundle { + val blockSinkBReqEntrance = Bool() + + override def apply() = 0.U.asTypeOf(this) +} + +class TXRSP(implicit p: Parameters) extends TL2CHIL2Module { + val io = IO(new Bundle() { + // val in = Flipped(DecoupledIO(new TaskBundle())) + val pipeRsp = Flipped(DecoupledIO(new TaskBundle)) + val mshrRsp = Flipped(DecoupledIO(new CHIRSP())) + val out = DecoupledIO(new CHIRSP()) + + val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI))) + val toReqArb = Output(new TXRSPBlockBundle) + }) + + assert(!io.pipeRsp.valid || io.pipeRsp.bits.toTXRSP, "txChannel is wrong for TXRSP") + assert(io.pipeRsp.ready, "TXRSP should never be full") + require(chiOpt.isDefined) + + // TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later + val queue = Module(new Queue(new CHIRSP, entries = mshrsAll, flow = true)) + + // Back pressure logic from TXRSP + val queueCnt = queue.io.count + // TODO: this may be imprecise, review this later + val pipeStatus_s1_s5 = io.pipeStatusVec + val pipeStatus_s1_s2 = pipeStatus_s1_s5.take(2) + val pipeStatus_s2 = pipeStatus_s1_s2.tail + val pipeStatus_s3_s5 = pipeStatus_s1_s5.drop(2) + // inflightCnt equals the number of reqs on s2~s5 that may flow into TXRSP soon, plus queueCnt. + // The calculation of inflightCnt might be imprecise and lead to false-positive back pressure. 
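+ // Note the asymmetric thresholds below: MSHR tasks are blocked one entry earlier (mshrsAll - 1) + // than sinkB tasks (mshrsAll), reserving headroom so that snoop responses are never starved by MSHR responses.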
+ val inflightCnt = PopCount(Cat(pipeStatus_s3_s5.map(s => s.valid && s.bits.toTXRSP && (s.bits.fromB || s.bits.mshrTask)))) + + PopCount(Cat(pipeStatus_s2.map(s => s.valid && Mux(s.bits.mshrTask, s.bits.toTXRSP, s.bits.fromB)))) + + queueCnt + val noSpaceForSinkBReq = inflightCnt >= mshrsAll.U + val noSpaceForMSHRReq = inflightCnt >= (mshrsAll-1).U + + io.toReqArb.blockSinkBReqEntrance := noSpaceForSinkBReq + io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq + + io.out.valid := queue.io.deq.valid + io.out.bits := queue.io.deq.bits + queue.io.deq.ready := io.out.ready + + queue.io.enq.valid := io.pipeRsp.valid || io.mshrRsp.valid && !noSpaceForSinkBReq && !noSpaceForMSHRReq + queue.io.enq.bits := Mux(io.pipeRsp.valid, toCHIRSPBundle(io.pipeRsp.bits), io.mshrRsp.bits) + + io.pipeRsp.ready := true.B + io.mshrRsp.ready := !io.pipeRsp.valid && !noSpaceForSinkBReq && !noSpaceForMSHRReq + + def toCHIRSPBundle(task: TaskBundle): CHIRSP = { + val rsp = WireInit(0.U.asTypeOf(new CHIRSP())) + rsp.tgtID := task.tgtID.get + rsp.srcID := task.srcID.get + rsp.txnID := task.txnID.get + rsp.dbID := task.dbID.get + rsp.pCrdType := task.pCrdType.get + rsp.opcode := task.chiOpcode.get + rsp.resp := task.resp.get + rsp.fwdState := task.fwdState.get + // TODO: Finish this + rsp + } +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala new file mode 100644 index 00000000..1fd13520 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala @@ -0,0 +1,306 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.Parameters +import coupledL2.L2Module + +class ChannelIO[+T <: Data](gen: T) extends Bundle { + // Flit Pending. Early indication that a flit might be transmitted in the following cycle + val flitpend = Output(Bool()) // To be confirmed: can this be ignored? + // Flit Valid. The transmitter sets the signal HIGH to indicate when FLIT[(W-1):0] is valid. + val flitv = Output(Bool()) + // Flit. + val flit = Output(UInt(gen.getWidth.W)) + // L-Credit Valid. The receiver sets this signal HIGH to return a channel L-Credit to a transmitter. 
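+ // A transmitter must hold one L-Credit for every flit it sends; credits arrive via lcrdv, + // and unused credits are handed back with LCrdReturn flits when the link deactivates.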
+ val lcrdv = Input(Bool()) +} + +object ChannelIO { + def apply[T <: Data](gen: T): ChannelIO[T] = new ChannelIO(gen) + + private final class EmptyBundle extends Bundle + def apply(): ChannelIO[Data] = apply(new EmptyBundle) +} + +trait HasLinkSwitch { this: Bundle => + val linkactivereq = Output(Bool()) + val linkactiveack = Input(Bool()) +} + +trait HasPortSwitch { this: Bundle => + val txsactive = Output(Bool()) + val rxsactive = Input(Bool()) +} + +class DownwardsLinkIO extends Bundle with HasLinkSwitch { + val req = ChannelIO(new CHIREQ) + val rsp = ChannelIO(new CHIRSP) + val dat = ChannelIO(new CHIDAT) +} + +class UpwardsLinkIO extends Bundle with HasLinkSwitch { + val rsp = ChannelIO(new CHIRSP) + val dat = ChannelIO(new CHIDAT) + val snp = ChannelIO(new CHISNP) +} + +class DecoupledDownwardsLinkIO extends Bundle { + val req = DecoupledIO(new CHIREQ) + val rsp = DecoupledIO(new CHIRSP) + val dat = DecoupledIO(new CHIDAT) +} + +class DecoupledUpwardsLinkIO extends Bundle { + val rsp = DecoupledIO(new CHIRSP) + val dat = DecoupledIO(new CHIDAT) + val snp = DecoupledIO(new CHISNP) +} + +class DecoupledDownwardsNoSnpLinkIO extends Bundle { + val req = DecoupledIO(new CHIREQ) + val dat = DecoupledIO(new CHIDAT) +} + +class DecoupledUpwardsNoSnpLinkIO extends Bundle { + val rsp = DecoupledIO(new CHIRSP) + val dat = DecoupledIO(new CHIDAT) +} + +class PortIO extends Bundle with HasPortSwitch { + val tx = new DownwardsLinkIO + val rx = Flipped(new UpwardsLinkIO) +} + +class DecoupledPortIO extends Bundle { + val tx = new DecoupledDownwardsLinkIO + val rx = Flipped(new DecoupledUpwardsLinkIO) +} + +class DecoupledNoSnpPortIO extends Bundle { + val tx = new DecoupledDownwardsNoSnpLinkIO + val rx = Flipped(new DecoupledUpwardsNoSnpLinkIO) +} + +object LinkStates { + val width = 2 + + def STOP = 0.U(width.W) + def ACTIVATE = 1.U(width.W) + def RUN = 2.U(width.W) + def DEACTIVATE = 3.U(width.W) +} + +class LinkState extends Bundle { + val state = UInt(LinkStates.width.W) +} + +object LinkState { + def apply(s: UInt) = { + val ls = Wire(new LinkState) + ls.state := s + ls + } + def onReset = LinkState(LinkStates.STOP) +} + +class LCredit2Decoupled[T <: Bundle]( + gen: T, + lcreditNum: Int = 4 // the number of L-Credits that a receiver can provide +) extends Module { + val io = IO(new Bundle() { + val in = Flipped(ChannelIO(gen.cloneType)) + val out = DecoupledIO(gen.cloneType) + val state = Input(new LinkState()) + val reclaimLCredit = Output(Bool()) + }) + + require(lcreditNum <= 15) + + val queue = Module(new Queue(gen.cloneType, entries = lcreditNum, pipe = true, flow = false)) + + val state = io.state.state + val enableLCredit = state === LinkStates.RUN + + val lcreditsWidth = log2Up(lcreditNum) + 1 + val lcreditInflight = RegInit(0.U(lcreditsWidth.W)) + val lcreditPool = RegInit(lcreditNum.U(lcreditsWidth.W)) + assert(lcreditInflight + lcreditPool === lcreditNum.U) + val lcreditOut = (lcreditPool > queue.io.count) && enableLCredit + + val ready = lcreditInflight =/= 0.U + val accept = ready && io.in.flitv && RegNext(io.in.flitpend) + + when (lcreditOut) { + when (!accept) { + lcreditInflight := lcreditInflight + 1.U + lcreditPool := lcreditPool - 1.U + } + }.otherwise { + when (accept) { + lcreditInflight := lcreditInflight - 1.U + lcreditPool := lcreditPool + 1.U + } + } + + queue.io.enq.valid := accept + // queue.io.enq.bits := io.in.bits + var lsb = 0 + queue.io.enq.bits.getElements.reverse.foreach { case e => + e := io.in.flit(lsb + e.asUInt.getWidth - 1, lsb).asTypeOf(e.cloneType) + 
lsb += e.asUInt.getWidth + } + + assert(!accept || queue.io.enq.ready) + + io.in.lcrdv := lcreditOut + + io.out <> queue.io.deq + val opcodeElements = queue.io.deq.bits.elements.filter(_._1 == "opcode") + require (opcodeElements.size == 1) + for ((_, opcode) <- opcodeElements) { + when (queue.io.deq.valid && opcode === 0.U) { + // This is a *LCrdReturn flit + queue.io.deq.ready := true.B + io.out.valid := false.B + } + } + io.reclaimLCredit := lcreditInflight === 0.U +} + +object LCredit2Decoupled { + val defaultLCreditNum = 4 + + def apply[T <: Bundle]( + left: ChannelIO[T], + right: DecoupledIO[T], + state: LinkState, + reclaimLCredit: Bool, + suggestName: Option[String] = None, + lcreditNum: Int = defaultLCreditNum + ): Unit = { + val mod = Module(new LCredit2Decoupled(right.bits.cloneType, lcreditNum)) + suggestName.foreach(name => mod.suggestName(s"LCredit2Decoupled_${name}")) + + mod.io.in <> left + right <> mod.io.out + mod.io.state := state + reclaimLCredit := mod.io.reclaimLCredit + } +} + +class Decoupled2LCredit[T <: Bundle](gen: T) extends Module { + val io = IO(new Bundle() { + val in = Flipped(DecoupledIO(gen.cloneType)) + val out = ChannelIO(gen.cloneType) + val state = Input(new LinkState()) + }) + + val state = io.state.state + val disableFlit = state === LinkStates.STOP || state === LinkStates.ACTIVATE + val disableLCredit = state === LinkStates.STOP + val acceptLCredit = io.out.lcrdv && !disableLCredit + + // The maximum number of L-Credits that a receiver can provide is 15. + val lcreditsMax = 15 + val lcreditPool = RegInit(0.U(log2Up(lcreditsMax).W)) + + val returnLCreditValid = !io.in.valid && state === LinkStates.DEACTIVATE && lcreditPool =/= 0.U + + when (acceptLCredit) { + when (!io.out.flitv) { + lcreditPool := lcreditPool + 1.U + assert(lcreditPool + 1.U =/= 0.U, "L-Credit pool overflow") + } + }.otherwise { + when (io.out.flitv) { + lcreditPool := lcreditPool - 1.U + } + } + + io.in.ready := lcreditPool =/= 0.U && !disableFlit + io.out.flitpend := true.B + io.out.flitv := io.in.fire || returnLCreditValid + io.out.flit := Mux( + io.in.valid, + Cat(io.in.bits.getElements.map(_.asUInt)), + 0.U // LCrdReturn + ) +} + +object Decoupled2LCredit { + def apply[T <: Bundle]( + left: DecoupledIO[T], + right: ChannelIO[T], + state: LinkState, + suggestName: Option[String] = None + ): Unit = { + val mod = Module(new Decoupled2LCredit(left.bits.cloneType)) + suggestName.foreach(name => mod.suggestName(s"Decoupled2LCredit_${name}")) + + mod.io.in <> left + right <> mod.io.out + mod.io.state := state + } +} + +class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIMsgParameters { + val io = IO(new Bundle() { + val in = Flipped(new DecoupledPortIO()) + val out = new PortIO + val nodeID = Input(UInt(NODEID_WIDTH.W)) + }) + // val s_stop :: s_activate :: s_run :: s_deactivate :: Nil = Enum(4) + + val txState = RegInit(LinkStates.STOP) + val rxState = RegInit(LinkStates.STOP) + + Seq(txState, rxState).zip(MixedVecInit(Seq(io.out.tx, io.out.rx))).foreach { case (state, link) => + state := MuxLookup(Cat(link.linkactivereq, link.linkactiveack), LinkStates.STOP)(Seq( + Cat(true.B, false.B) -> LinkStates.ACTIVATE, + Cat(true.B, true.B) -> LinkStates.RUN, + Cat(false.B, true.B) -> LinkStates.DEACTIVATE, + Cat(false.B, false.B) -> LinkStates.STOP + )) + } + + /* IO assignment */ + val rxsnpDeact, rxrspDeact, rxdatDeact = Wire(Bool()) + val rxDeact = rxsnpDeact && rxrspDeact && rxdatDeact + Decoupled2LCredit(setSrcID(io.in.tx.req, io.nodeID), io.out.tx.req, 
LinkState(txState), Some("txreq")) + Decoupled2LCredit(setSrcID(io.in.tx.rsp, io.nodeID), io.out.tx.rsp, LinkState(txState), Some("txrsp")) + Decoupled2LCredit(setSrcID(io.in.tx.dat, io.nodeID), io.out.tx.dat, LinkState(txState), Some("txdat")) + LCredit2Decoupled(io.out.rx.snp, io.in.rx.snp, LinkState(rxState), rxsnpDeact, Some("rxsnp")) + LCredit2Decoupled(io.out.rx.rsp, io.in.rx.rsp, LinkState(rxState), rxrspDeact, Some("rxrsp")) + LCredit2Decoupled(io.out.rx.dat, io.in.rx.dat, LinkState(rxState), rxdatDeact, Some("rxdat")) + + io.out.txsactive := true.B + io.out.tx.linkactivereq := !reset.asBool + io.out.rx.linkactiveack := RegNext(io.out.rx.linkactivereq) || !rxDeact + + dontTouch(io.out) + + def setSrcID[T <: Bundle](in: DecoupledIO[T], srcID: UInt = 0.U): DecoupledIO[T] = { + val out = Wire(in.cloneType) + out <> in + out.bits.elements.filter(_._1 == "srcID").head._2 := srcID + out + } +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/chi/Message.scala b/src/main/scala/coupledL2/tl2chi/chi/Message.scala new file mode 100644 index 00000000..57b3eb62 --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/chi/Message.scala @@ -0,0 +1,264 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.Parameters +import scala.math.max +import coupledL2.TaskBundle + +object CHICohStates { + val width = 3 + + def I = "b000".U(width.W) + def SC = "b001".U(width.W) + def UC = "b010".U(width.W) + def UD = "b010".U(width.W) + def SD = "b011".U(width.W) + + def PassDirty = "b100".U(width.W) + + def I_PD = setPD(I) + def SC_PD = setPD(SC) + def UC_PD = setPD(UC) + def UD_PD = setPD(UD) + def SD_PD = setPD(SD) + + def setPD(state: UInt, pd: Bool = true.B): UInt = { + require(state.getWidth == width) + state | Mux(pd, PassDirty, 0.U) + } +} + +object OrderEncodings { + val width = 2 + + def None = "b00".U(width.W) + def RequestAccepted = "b01".U(width.W) + def RequestOrder = "b10".U(width.W) + def OWO = "b10".U(width.W) // Ordered Write Observation + def EndpointOrder = "b11".U(width.W) + + def isRequestOrder(order: UInt): Bool = order >= RequestOrder +} + +class MemAttr extends Bundle { + // The Allocate attribute is an allocation hint. + // It indicates the recommended allocation policy for a transaction. + val allocate = Bool() + // The Cacheable attribute indicates if a transaction must perform a cache lookup. + val cacheable = Bool() + // Device attribute indicates if the memory type is either Device or Normal. 
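+ // A transaction targeting Device memory is also expected to be non-cacheable (cacheable = 0).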
+ val device = Bool() + // Early Write Acknowledge (EWA) + // EWA indicates where the write completion response for a transaction may come from: + // If true, comp is permitted to come from an intermediate point in the interconnect, such as a Home Node. + // If false, comp must come from the final endpoint that a transaction is destined for. + val ewa = Bool() +} + +object MemAttr extends HasCHIMsgParameters { + def apply(allocate: Bool, cacheable: Bool, device: Bool, ewa: Bool): MemAttr = { + val memAttr = Wire(new MemAttr) + memAttr.allocate := allocate + memAttr.cacheable := cacheable + memAttr.device := device + memAttr.ewa := ewa + memAttr + } + def apply(): MemAttr = apply(false.B, false.B, false.B, false.B) +} + +trait HasCHIMsgParameters { + // TODO: Confirm the fields and their corresponding widths + def NODEID_WIDTH = 7 + require(NODEID_WIDTH >= 7 && NODEID_WIDTH <= 11) + + // Transaction request fields + def QOS_WIDTH = 4 + def TGTID_WIDTH = NODEID_WIDTH + def SRCID_WIDTH = NODEID_WIDTH + def TXNID_WIDTH = 8 // An 8-bit field is defined for the TxnID to accommodate up to 256 outstanding transactions + def LPID_WIDTH = 5 // TODO: To be confirmed + def RETURNNID_WIDTH = NODEID_WIDTH + def RETURNTXNID_WIDTH = TXNID_WIDTH + def STASHNID_WIDTH = NODEID_WIDTH + def STASHLPID_WIDTH = LPID_WIDTH + // def STASHINFO_WIDTH = 2 //TODO + + + def REQ_OPCODE_WIDTH = CHIOpcode.REQOpcodes.width + def RSP_OPCODE_WIDTH = CHIOpcode.RSPOpcodes.width + def SNP_OPCODE_WIDTH = CHIOpcode.SNPOpcodes.width + def DAT_OPCODE_WIDTH = CHIOpcode.DATOpcodes.width + def OPCODE_WIDTH = max(REQ_OPCODE_WIDTH, max(RSP_OPCODE_WIDTH, max(SNP_OPCODE_WIDTH, DAT_OPCODE_WIDTH))) + + def ADDR_WIDTH = 48 // TODO: To be confirmed + def SNP_ADDR_WIDTH = ADDR_WIDTH - 3 + def SIZE_WIDTH = 3 + def PCRDTYPE_WIDTH = 4 + def MEMATTR_WIDTH = 4 + def ORDER_WIDTH = OrderEncodings.width + + // Snoop request fields + def FWDNID_WIDTH = NODEID_WIDTH + def FWDTXNID_WIDTH = TXNID_WIDTH + def VMIDEXT_WIDTH = 8 // TODO: To be confirmed + + // Data fields && Response fields + def HOMENID_WIDTH = NODEID_WIDTH + def DBID_WIDTH = TXNID_WIDTH + def RESPERR_WIDTH = 2 + def RESP_WIDTH = CHICohStates.width + def FWDSTATE_WIDTH = CHICohStates.width + def DATAPULL_WIDTH = 3 + def DATASOURCE_WIDTH = 3 + def CCID_WIDTH = 2 // TODO: To be confirmed + def DATAID_WIDTH = 2 // TODO: To be confirmed + def BE_WIDTH = DATA_WIDTH / 8 + def DATA_WIDTH = 256 + def DATACHECK_WIDTH = DATA_WIDTH / 8 + + // User defined + /* + * Currently don't care about *::RSVDC, and the width is tied to 4. 
+ */ + def REQ_RSVDC_WIDTH = 4 // Permitted RSVDC bus widths X = 0, 4, 12, 16, 24, 32 + def DAT_RSVDC_WIDTH = 4 // Permitted RSVDC bus widths Y = 0, 4, 12, 16, 24, 32 +} + +abstract class CHIBundle extends Bundle with HasCHIMsgParameters + +class CHIREQ extends CHIBundle { + // BE CAUTIOUS with the order of the flit fields + + /* LSB */ + val qos = UInt(QOS_WIDTH.W) + val tgtID = UInt(TGTID_WIDTH.W) + val srcID = UInt(SRCID_WIDTH.W) + val txnID = UInt(TXNID_WIDTH.W) + + val returnNID = UInt(RETURNNID_WIDTH.W) // Used for DMT + def stashNID = returnNID // Used for Stash + + val stashNIDValid = Bool() // Used for Stash + def endian = stashNIDValid // Used for Atomic + + val returnTxnID = UInt(RETURNTXNID_WIDTH.W) + def stashLPID = returnTxnID(STASHLPID_WIDTH - 1, 0) + def stashLPIDValid = returnTxnID(STASHLPID_WIDTH).asBool + + val opcode = UInt(REQ_OPCODE_WIDTH.W) + val size = UInt(SIZE_WIDTH.W) + val addr = UInt(ADDR_WIDTH.W) + val ns = Bool() + val likelyshared = Bool() + val allowRetry = Bool() + val order = UInt(ORDER_WIDTH.W) + val pCrdType = UInt(PCRDTYPE_WIDTH.W) + val memAttr = new MemAttr() + val snpAttr = Bool() + val lpID = UInt(LPID_WIDTH.W) + + val snoopMe = Bool() // Used for Atomic + def excl = snoopMe // Used for Exclusive transactions + + val expCompAck = Bool() + val traceTag = Bool() + val rsvdc = UInt(REQ_RSVDC_WIDTH.W) + + /* MSB */ +} + +class CHISNP extends CHIBundle { + // BE CAUTIOUS with the order of the flit fields + + /* LSB */ + val qos = UInt(QOS_WIDTH.W) + val srcID = UInt(SRCID_WIDTH.W) + val txnID = UInt(TXNID_WIDTH.W) + val fwdNID = UInt(FWDNID_WIDTH.W) + + val fwdTxnID = UInt(FWDTXNID_WIDTH.W) + def stashLPID = fwdTxnID(STASHLPID_WIDTH - 1, 0) + def stashLPIDValid = fwdTxnID(STASHLPID_WIDTH).asBool + def vmIDExt = fwdTxnID + + val opcode = UInt(SNP_OPCODE_WIDTH.W) + val addr = UInt(SNP_ADDR_WIDTH.W) + val ns = Bool() + + val doNotGoToSD = Bool() + def doNotDataPull = doNotGoToSD + + val retToSrc = Bool() + val traceTag = Bool() + + /* MSB */ +} + +class CHIDAT extends CHIBundle { + // BE CAUTIOUS with the order of the flit fields + + /* LSB */ + val qos = UInt(QOS_WIDTH.W) + val tgtID = UInt(TGTID_WIDTH.W) + val srcID = UInt(SRCID_WIDTH.W) + val txnID = UInt(TXNID_WIDTH.W) + val homeNID = UInt(HOMENID_WIDTH.W) + val opcode = UInt(DAT_OPCODE_WIDTH.W) + val respErr = UInt(RESPERR_WIDTH.W) + val resp = UInt(RESP_WIDTH.W) + + val fwdState = UInt(FWDSTATE_WIDTH.W) // Used for DCT + def dataPull = fwdState // Used for Stash + def dataSource = fwdState // Indicates Data source in a response + + val dbID = UInt(DBID_WIDTH.W) + val ccID = UInt(CCID_WIDTH.W) + val dataID = UInt(DATAID_WIDTH.W) + val traceTag = Bool() + val rsvdc = UInt(DAT_RSVDC_WIDTH.W) + val be = UInt(BE_WIDTH.W) + val data = UInt(DATA_WIDTH.W) + // TODO: maybe Data Check and Poison + + /* MSB */ +} + +class CHIRSP extends CHIBundle { + // BE CAUTIOUS with the order of the flit fields + + /* LSB */ + val qos = UInt(QOS_WIDTH.W) + val tgtID = UInt(TGTID_WIDTH.W) + val srcID = UInt(SRCID_WIDTH.W) + val txnID = UInt(TXNID_WIDTH.W) + val opcode = UInt(RSP_OPCODE_WIDTH.W) + val respErr = UInt(RESPERR_WIDTH.W) + val resp = UInt(RESP_WIDTH.W) + + val fwdState = UInt(FWDSTATE_WIDTH.W) + def dataPull = fwdState + + val dbID = UInt(DBID_WIDTH.W) + val pCrdType = UInt(PCRDTYPE_WIDTH.W) + val traceTag = Bool() + /* MSB */ +} diff --git a/src/main/scala/coupledL2/tl2chi/chi/NetworkLayer.scala b/src/main/scala/coupledL2/tl2chi/chi/NetworkLayer.scala new file mode 100644 index 00000000..86ce10f6 --- /dev/null 
+++ b/src/main/scala/coupledL2/tl2chi/chi/NetworkLayer.scala @@ -0,0 +1,44 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ +import freechips.rocketchip.diplomacy.AddressSet +import utility.ParallelPriorityMux + +/** + * System Address Map + * + * Each Requester, that is, each RN and HN in the system, must have a System Address Map (SAM) + * to determine the target ID of a request. + */ +class SAM(sam: Seq[(AddressSet, Int)]) { + def check(x: UInt): Bool = Cat(sam.map(_._1.contains(x))).orR + + // def lookup(x: BigInt): Int = ParallelPriorityMux(sam.map(m => (m._1.contains(x), m._2))) + def lookup(x: UInt): UInt = { + assert(check(x)) + ParallelPriorityMux(sam.map(m => (m._1.contains(x), m._2.U))) + } +} + +object SAM { + def apply(sam: Seq[(AddressSet, Int)]) = new SAM(sam) + def apply(sam: (AddressSet, Int)) = new SAM(Seq(sam)) +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/tl2chi/chi/Opcode.scala b/src/main/scala/coupledL2/tl2chi/chi/Opcode.scala new file mode 100644 index 00000000..4c67e00b --- /dev/null +++ b/src/main/scala/coupledL2/tl2chi/chi/Opcode.scala @@ -0,0 +1,228 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2chi + +import chisel3._ +import chisel3.util._ + +object CHIOpcode { + + object REQOpcodes { + val width = 6 + + def ReqLCrdReturn = 0x00.U(width.W) + def ReadShared = 0x01.U(width.W) + def ReadClean = 0x02.U(width.W) + def ReadOnce = 0x03.U(width.W) + def ReadNoSnp = 0x04.U(width.W) + def PCrdReturn = 0x05.U(width.W) + + def ReadUnique = 0x07.U(width.W) + def CleanShared = 0x08.U(width.W) + def CleanInvalid = 0x09.U(width.W) + def MakeInvalid = 0x0A.U(width.W) + def CleanUnique = 0x0B.U(width.W) + def MakeUnique = 0x0C.U(width.W) + def Evict = 0x0D.U(width.W) + + def DVMOp = 0x14.U(width.W) + def WriteEvictFull = 0x15.U(width.W) + + def WriteCleanFull = 0x17.U(width.W) + def WriteUniquePtl = 0x18.U(width.W) + def WriteUniqueFull = 0x19.U(width.W) + def WriteBackPtl = 0x1A.U(width.W) + def WriteBackFull = 0x1B.U(width.W) + def WriteNoSnpPtl = 0x1C.U(width.W) + def WriteNoSnpFull = 0x1D.U(width.W) + + def WriteUniqueFullStash = 0x20.U(width.W) + def WriteUniquePtlStash = 0x21.U(width.W) + def StashOnceShared = 0x22.U(width.W) + def StashOnceUnique = 0x23.U(width.W) + def ReadOnceCleanInvalid = 0x24.U(width.W) + def ReadOnceMakeInvalid = 0x25.U(width.W) + def ReadNotSharedDirty = 0x26.U(width.W) + def CleanSharedPersist = 0x27.U(width.W) + + def AtomicStore_ADD = 0x28.U(width.W) + def AtomicStore_CLR = 0x29.U(width.W) + def AtomicStore_EOR = 0x2A.U(width.W) + def AtomicStore_SET = 0x2B.U(width.W) + def AtomicStore_SMAX = 0x2C.U(width.W) + def AtomicStore_SMIN = 0x2D.U(width.W) + def AtomicStore_UMAX = 0x2E.U(width.W) + def AtomicStore_UMIN = 0x2F.U(width.W) + def AtomicLoad_ADD = 0x30.U(width.W) + def AtomicLoad_CLR = 0x31.U(width.W) + def AtomicLoad_EOR = 0x32.U(width.W) + def AtomicLoad_SET = 0x33.U(width.W) + def AtomicLoad_SMAX = 0x34.U(width.W) + def AtomicLoad_SMIN = 0x35.U(width.W) + def AtomicLoad_UMAX = 0x36.U(width.W) + def AtomicLoad_UMIN = 0x37.U(width.W) + def AtomicSwap = 0x38.U(width.W) + def AtomicCompare = 0x39.U(width.W) + def PrefetchTgt = 0x3A.U(width.W) + } + + object RSPOpcodes { + val width = 4 + + def RespLCrdReturn = 0x0.U(width.W) + def SnpResp = 0x1.U(width.W) + def CompAck = 0x2.U(width.W) + def RetryAck = 0x3.U(width.W) + def Comp = 0x4.U(width.W) + def CompDBIDResp = 0x5.U(width.W) + def DBIDResp = 0x6.U(width.W) + def PCrdGrant = 0x7.U(width.W) + def ReadReceipt = 0x8.U(width.W) + def SnpRespFwded = 0x9.U(width.W) + } + + object SNPOpcodes { + val width = 5 + + def SnpLCrdReturn = 0x00.U(width.W) + def SnpShared = 0x01.U(width.W) + def SnpClean = 0x02.U(width.W) + def SnpOnce = 0x03.U(width.W) + def SnpNotSharedDirty = 0x04.U(width.W) + def SnpUniqueStash = 0x05.U(width.W) + def SnpMakeInvalidStash = 0x06.U(width.W) + def SnpUnique = 0x07.U(width.W) + def SnpCleanShared = 0x08.U(width.W) + def SnpCleanInvalid = 0x09.U(width.W) + def SnpMakeInvalid = 0x0A.U(width.W) + def SnpStashUnique = 0x0B.U(width.W) + def SnpStashShared = 0x0C.U(width.W) + def SnpDVMOp = 0x0D.U(width.W) + + def SnpSharedFwd = 0x11.U(width.W) + def SnpCleanFwd = 0x12.U(width.W) + def SnpOnceFwd = 0x13.U(width.W) + def SnpNotSharedDirtyFwd = 0x14.U(width.W) + + def SnpUniqueFwd = 0x17.U(width.W) + + def widthCheck(opcode: UInt): Unit = { require (opcode.getWidth >= width) } + + def isSnpXStash(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpUniqueStash || opcode === SnpMakeInvalidStash + } + + def isSnpStashX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode 
=== SnpStashUnique || opcode === SnpStashShared + } + + def isSnpXFwd(opcode: UInt): Bool = { + widthCheck(opcode) + opcode >= SnpSharedFwd + } + + + def isSnpOnceX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpOnce || opcode === SnpOnceFwd + } + + def isSnpCleanX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpClean || opcode === SnpCleanFwd + } + + def isSnpSharedX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpShared || opcode === SnpSharedFwd + } + + def isSnpNotSharedDirtyX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpNotSharedDirty || opcode === SnpNotSharedDirtyFwd + } + + def isSnpToB(opcode: UInt): Bool = { + isSnpCleanX(opcode) || isSnpSharedX(opcode) || isSnpNotSharedDirtyX(opcode) + } + + def isSnpToN(opcode: UInt): Bool = { + isSnpUniqueX(opcode) || opcode === SnpCleanInvalid || isSnpMakeInvalidX (opcode) + } + + def isSnpCleanShared(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpCleanShared + } + + def isSnpToBNonFwd(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpClean || + opcode === SnpNotSharedDirty || + opcode === SnpShared + } + + def isSnpToBFwd(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpCleanFwd || + opcode === SnpNotSharedDirtyFwd || + opcode === SnpSharedFwd + } + + def isSnpToNNonFwd(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpUnique || opcode === SnpUniqueStash + } + + def isSnpToNFwd(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpUniqueFwd + } + + def isSnpUniqueX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpUnique || opcode === SnpUniqueFwd || opcode === SnpUniqueStash + } + + def isSnpMakeInvalidX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpMakeInvalid || opcode === SnpMakeInvalidStash + } + } + + object DATOpcodes { + val width = 3 + + def DataLCrdReturn = 0x0.U(width.W) + def SnpRespData = 0x1.U(width.W) + def CopyBackWrData = 0x2.U(width.W) + def NonCopyBackWrData = 0x3.U(width.W) + def CompData = 0x4.U(width.W) + def SnpRespDataPtl = 0x5.U(width.W) + def SnpRespDataFwded = 0x6.U(width.W) + def WriteDataCancel = 0x7.U(width.W) + + def widthCheck(opcode: UInt): Unit = { require (opcode.getWidth >= width) } + def isSnpRespDataX(opcode: UInt): Bool = { + widthCheck(opcode) + opcode === SnpRespData || opcode === SnpRespDataPtl || opcode === SnpRespDataFwded + } + } +} diff --git a/src/main/scala/coupledL2/AcquireUnit.scala b/src/main/scala/coupledL2/tl2tl/AcquireUnit.scala similarity index 97% rename from src/main/scala/coupledL2/AcquireUnit.scala rename to src/main/scala/coupledL2/tl2tl/AcquireUnit.scala index c391c003..6726c9be 100644 --- a/src/main/scala/coupledL2/AcquireUnit.scala +++ b/src/main/scala/coupledL2/tl2tl/AcquireUnit.scala @@ -15,7 +15,7 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -24,6 +24,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import org.chipsalliance.cde.config.Parameters import huancun.{DirtyKey, PreferCacheKey} +import coupledL2._ class AcquireUnit(implicit p: Parameters) extends L2Module { val io = IO(new Bundle() { diff --git a/src/main/scala/coupledL2/tl2tl/Bundle.scala b/src/main/scala/coupledL2/tl2tl/Bundle.scala new file mode 100644 index 00000000..203a3c89 --- /dev/null +++ b/src/main/scala/coupledL2/tl2tl/Bundle.scala @@ -0,0 +1,53 @@ +/** 
************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + +package coupledL2.tl2tl + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.tilelink.TLPermissions._ +import utility.MemReqSource +import coupledL2._ + +// MSHR exposes signals to MSHRCtl +class MSHRStatus(implicit p: Parameters) extends L2Bundle with HasTLChannelBits { + val set = UInt(setBits.W) + val reqTag = UInt(tagBits.W) + val metaTag = UInt(tagBits.W) + val needsRepl = Bool() + val w_c_resp = Bool() + val w_d_resp = Bool() + val will_free = Bool() + + // val way = UInt(wayBits.W) +// val off = UInt(offsetBits.W) +// val opcode = UInt(3.W) +// val param = UInt(3.W) +// val size = UInt(msgSizeBits.W) +// val source = UInt(sourceIdBits.W) +// val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W)) +// val aliasTask = aliasBitsOpt.map(_ => Bool()) +// val needProbeAckData = Bool() // only for B reqs +// val fromL2pft = prefetchOpt.map(_ => Bool()) +// val needHint = prefetchOpt.map(_ => Bool()) + + // for TopDown usage + val reqSource = UInt(MemReqSource.reqSourceBits.W) + val is_miss = Bool() + val is_prefetch = Bool() +} diff --git a/src/main/scala/coupledL2/MSHR.scala b/src/main/scala/coupledL2/tl2tl/MSHR.scala similarity index 97% rename from src/main/scala/coupledL2/MSHR.scala rename to src/main/scala/coupledL2/tl2tl/MSHR.scala index 46e287b8..465b7b9e 100644 --- a/src/main/scala/coupledL2/MSHR.scala +++ b/src/main/scala/coupledL2/tl2tl/MSHR.scala @@ -15,7 +15,7 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -25,6 +25,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.tilelink.TLPermissions._ import org.chipsalliance.cde.config.Parameters +import coupledL2._ import coupledL2.prefetch.{PfSource, PrefetchTrain} import coupledL2.utils.XSPerfAccumulate @@ -160,6 +161,7 @@ class MSHR(implicit p: Parameters) extends L2Module { } val mp_release, mp_probeack, mp_grant = Wire(new TaskBundle) val mp_release_task = { + mp_release := 0.U.asTypeOf(new TaskBundle) mp_release.channel := req.channel mp_release.tag := dirResult.tag mp_release.set := req.set @@ -203,10 +205,12 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_release.reqSource := 0.U(MemReqSource.reqSourceBits.W) mp_release.mergeA := false.B mp_release.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + mp_release.txChannel := 0.U mp_release } val mp_probeack_task = { + mp_probeack := 0.U.asTypeOf(new TaskBundle) mp_probeack.channel := req.channel mp_probeack.tag := req.tag mp_probeack.set := req.set @@ -264,6 +268,7 @@ class MSHR(implicit p: 
Parameters) extends L2Module { mp_probeack.replTask := false.B mp_probeack.mergeA := false.B mp_probeack.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + mp_probeack.txChannel := 0.U mp_probeack } @@ -275,6 +280,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mergeA := false.B } val mp_grant_task = { + mp_grant := 0.U.asTypeOf(new TaskBundle) mp_grant.channel := req.channel mp_grant.tag := req.tag mp_grant.set := req.set @@ -386,6 +392,7 @@ class MSHR(implicit p: Parameters) extends L2Module { prefetch = false.B, accessed = true.B ) + mp_grant.txChannel := 0.U mp_grant } @@ -513,9 +520,6 @@ class MSHR(implicit p: Parameters) extends L2Module { timer := 0.U } - // when grant not received, B can nest A - val nestB = !state.w_grantfirst - // alias: should protect meta from being accessed or occupied val releaseNotSent = !state.s_release io.status.valid := req_valid @@ -536,19 +540,25 @@ class MSHR(implicit p: Parameters) extends L2Module { io.msInfo.bits.set := req.set io.msInfo.bits.way := dirResult.way io.msInfo.bits.reqTag := req.tag + io.msInfo.bits.aliasTask.foreach(_ := req.aliasTask.getOrElse(false.B)) io.msInfo.bits.needRelease := !state.w_releaseack // if releaseTask is already in mainpipe_s1/s2, while a refillTask in mainpipe_s3, the refill should also be blocked and retry io.msInfo.bits.blockRefill := releaseNotSent || RegNext(releaseNotSent,false.B) || RegNext(RegNext(releaseNotSent,false.B),false.B) io.msInfo.bits.dirHit := dirResult.hit io.msInfo.bits.metaTag := dirResult.tag io.msInfo.bits.willFree := will_free - io.msInfo.bits.nestB := nestB io.msInfo.bits.isAcqOrPrefetch := req_acquire || req_prefetch io.msInfo.bits.isPrefetch := req_prefetch - io.msInfo.bits.s_refill := state.s_refill io.msInfo.bits.param := req.param io.msInfo.bits.mergeA := mergeA + io.msInfo.bits.w_grantfirst := state.w_grantfirst + io.msInfo.bits.s_refill := state.s_refill io.msInfo.bits.w_releaseack := state.w_releaseack + io.msInfo.bits.w_replResp := state.w_replResp + io.msInfo.bits.w_rprobeacklast := state.w_rprobeacklast + io.msInfo.bits.replaceData := mp_release.opcode === ReleaseData + io.msInfo.bits.metaState := meta.state + io.msInfo.bits.channel := req.channel assert(!(c_resp.valid && !io.status.bits.w_c_resp)) assert(!(d_resp.valid && !io.status.bits.w_d_resp)) diff --git a/src/main/scala/coupledL2/MSHRCtl.scala b/src/main/scala/coupledL2/tl2tl/MSHRCtl.scala similarity index 99% rename from src/main/scala/coupledL2/MSHRCtl.scala rename to src/main/scala/coupledL2/tl2tl/MSHRCtl.scala index d1dfd57d..6cf990fb 100644 --- a/src/main/scala/coupledL2/MSHRCtl.scala +++ b/src/main/scala/coupledL2/tl2tl/MSHRCtl.scala @@ -15,7 +15,7 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -23,6 +23,7 @@ import utility._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ +import coupledL2._ import coupledL2.prefetch.PrefetchTrain import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram, XSPerfMax} diff --git a/src/main/scala/coupledL2/MainPipe.scala b/src/main/scala/coupledL2/tl2tl/MainPipe.scala similarity index 99% rename from src/main/scala/coupledL2/MainPipe.scala rename to src/main/scala/coupledL2/tl2tl/MainPipe.scala index 309a4109..5e6fb9fc 100644 --- a/src/main/scala/coupledL2/MainPipe.scala +++ b/src/main/scala/coupledL2/tl2tl/MainPipe.scala @@ -15,7 +15,7 @@ * 
************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -25,6 +25,7 @@ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.tilelink.TLPermissions._ +import coupledL2._ import coupledL2.utils._ import coupledL2.debug._ import coupledL2.prefetch.{PfSource, PrefetchTrain} @@ -225,6 +226,10 @@ class MainPipe(implicit p: Parameters) extends L2Module { ms_task.reqSource := req_s3.reqSource ms_task.mergeA := req_s3.mergeA ms_task.aMergeTask := req_s3.aMergeTask + ms_task.txChannel := 0.U + ms_task.snpHitRelease := false.B + ms_task.snpHitReleaseWithData := false.B + ms_task.snpHitReleaseIdx := 0.U /* ======== Resps to SinkA/B/C Reqs ======== */ val sink_resp_s3 = WireInit(0.U.asTypeOf(Valid(new TaskBundle))) // resp for sinkA/B/C request that does not need to alloc mshr @@ -400,6 +405,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { // This serves as VALID signal // c_set_dirty is true iff Release has Data io.nestedwb.c_set_dirty := task_s3.valid && task_s3.bits.fromC && task_s3.bits.opcode === ReleaseData + io.nestedwb.b_inv_dirty := false.B io.nestedwbData := c_releaseData_s3.asTypeOf(new DSBlock) @@ -502,6 +508,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { io.releaseBufWrite.valid := task_s5.valid && need_write_releaseBuf_s5 io.releaseBufWrite.bits.id := task_s5.bits.mshrId io.releaseBufWrite.bits.data.data := rdata_s5 + io.releaseBufWrite.bits.beatMask := Fill(beatSize, true.B) val c_d_valid_s5 = task_s5.valid && !RegNext(chnl_fire_s4, false.B) && !RegNextN(chnl_fire_s3, 2, Some(false.B)) c_s5.valid := c_d_valid_s5 && isC_s5 diff --git a/src/main/scala/coupledL2/ProbeQueue.scala b/src/main/scala/coupledL2/tl2tl/ProbeQueue.scala similarity index 98% rename from src/main/scala/coupledL2/ProbeQueue.scala rename to src/main/scala/coupledL2/tl2tl/ProbeQueue.scala index 4cce2d4a..e8dce881 100644 --- a/src/main/scala/coupledL2/ProbeQueue.scala +++ b/src/main/scala/coupledL2/tl2tl/ProbeQueue.scala @@ -15,10 +15,11 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ +import coupledL2._ import coupledL2.utils._ import freechips.rocketchip.tilelink._ import org.chipsalliance.cde.config.Parameters diff --git a/src/main/scala/coupledL2/RefillUnit.scala b/src/main/scala/coupledL2/tl2tl/RefillUnit.scala similarity index 97% rename from src/main/scala/coupledL2/RefillUnit.scala rename to src/main/scala/coupledL2/tl2tl/RefillUnit.scala index 3aeb1cda..4e6d6b32 100644 --- a/src/main/scala/coupledL2/RefillUnit.scala +++ b/src/main/scala/coupledL2/tl2tl/RefillUnit.scala @@ -15,13 +15,14 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import org.chipsalliance.cde.config.Parameters +import coupledL2._ import coupledL2.utils.XSPerfAccumulate import huancun.{DirtyKey, IsHitKey} @@ -60,6 +61,7 @@ class RefillUnit(implicit p: Parameters) extends L2Module { io.refillBufWrite.valid := io.sinkD.valid && hasData && last io.refillBufWrite.bits.id := io.sinkD.bits.source io.refillBufWrite.bits.data.data := Cat(io.sinkD.bits.data, 
grantDataBuf) + io.refillBufWrite.bits.beatMask := Fill(beatSize, true.B) io.resp.valid := (first || last) && io.sinkD.valid io.resp.mshrId := io.sinkD.bits.source diff --git a/src/main/scala/coupledL2/SinkB.scala b/src/main/scala/coupledL2/tl2tl/SinkB.scala similarity index 93% rename from src/main/scala/coupledL2/SinkB.scala rename to src/main/scala/coupledL2/tl2tl/SinkB.scala index d68b6778..5c23d7d9 100644 --- a/src/main/scala/coupledL2/SinkB.scala +++ b/src/main/scala/coupledL2/tl2tl/SinkB.scala @@ -15,7 +15,7 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -23,6 +23,7 @@ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.tilelink.TLPermissions._ +import coupledL2._ import coupledL2.utils.XSPerfAccumulate import utility.MemReqSource @@ -36,6 +37,7 @@ class SinkB(implicit p: Parameters) extends L2Module { def fromTLBtoTaskBundle(b: TLBundleB): TaskBundle = { val task = Wire(new TaskBundle) task.channel := "b010".U + task.txChannel := 0.U task.tag := parseAddress(b.address)._1 task.set := parseAddress(b.address)._2 task.off := parseAddress(b.address)._3 @@ -66,13 +68,16 @@ class SinkB(implicit p: Parameters) extends L2Module { task.replTask := false.B task.mergeA := false.B task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + task.snpHitRelease := false.B + task.snpHitReleaseWithData := false.B + task.snpHitReleaseIdx := 0.U task } val task = fromTLBtoTaskBundle(io.b.bits) // unable to accept incoming B req because same-addr as some MSHR REQ val addrConflict = VecInit(io.msInfo.map(s => - s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag && !s.bits.willFree && !s.bits.nestB + s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag && !s.bits.willFree && s.bits.w_grantfirst )).asUInt.orR // unable to accept incoming B req because same-addr Release to L3 and have not received ReleaseAck, and some MSHR replaced block and cannot nest diff --git a/src/main/scala/coupledL2/Slice.scala b/src/main/scala/coupledL2/tl2tl/Slice.scala similarity index 96% rename from src/main/scala/coupledL2/Slice.scala rename to src/main/scala/coupledL2/tl2tl/Slice.scala index 96f79af6..1d97ecd0 100644 --- a/src/main/scala/coupledL2/Slice.scala +++ b/src/main/scala/coupledL2/tl2tl/Slice.scala @@ -15,7 +15,7 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ @@ -23,6 +23,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.util.leftOR import org.chipsalliance.cde.config.Parameters +import coupledL2._ import coupledL2.utils._ import coupledL2.debug._ import coupledL2.prefetch.PrefetchIO @@ -79,7 +80,8 @@ class Slice()(implicit p: Parameters) extends L2Module { reqArb.io.fromMSHRCtl := mshrCtl.io.toReqArb reqArb.io.fromMainPipe := mainPipe.io.toReqArb reqArb.io.fromGrantBuffer := grantBuf.io.toReqArb - reqArb.io.fromSourceC := sourceC.io.toReqArb + reqArb.io.fromSourceC.foreach(_ := sourceC.io.toReqArb) + reqArb.io.msInfo := mshrCtl.io.msInfo mshrCtl.io.fromReqArb.status_s1 := reqArb.io.status_s1 mshrCtl.io.resps.sinkC := sinkC.io.resp @@ -114,6 +116,7 @@ class Slice()(implicit p: Parameters) extends L2Module { releaseBuf.io.w(0).valid := 
mshrCtl.io.nestedwbDataId.valid releaseBuf.io.w(0).bits.data := mainPipe.io.nestedwbData releaseBuf.io.w(0).bits.id := mshrCtl.io.nestedwbDataId.bits + releaseBuf.io.w(0).bits.beatMask := Fill(beatSize, true.B) releaseBuf.io.w(1) <> sinkC.io.releaseBufWrite releaseBuf.io.w(1).bits.id := mshrCtl.io.releaseBufWriteId releaseBuf.io.w(2) <> mainPipe.io.releaseBufWrite @@ -133,7 +136,7 @@ class Slice()(implicit p: Parameters) extends L2Module { grantBuf.io.d_task <> mainPipe.io.toSourceD grantBuf.io.fromReqArb.status_s1 := reqArb.io.status_s1 grantBuf.io.pipeStatusVec := reqArb.io.status_vec ++ mainPipe.io.status_vec_toD - mshrCtl.io.pipeStatusVec(0) := reqArb.io.status_vec(1) // s2 status + mshrCtl.io.pipeStatusVec(0) := (reqArb.io.status_vec)(1) // s2 status mshrCtl.io.pipeStatusVec(1) := mainPipe.io.status_vec_toD(0) // s3 status io.prefetch.foreach { diff --git a/src/main/scala/coupledL2/SourceC.scala b/src/main/scala/coupledL2/tl2tl/SourceC.scala similarity index 96% rename from src/main/scala/coupledL2/SourceC.scala rename to src/main/scala/coupledL2/tl2tl/SourceC.scala index 25e04323..5251b0da 100644 --- a/src/main/scala/coupledL2/SourceC.scala +++ b/src/main/scala/coupledL2/tl2tl/SourceC.scala @@ -15,13 +15,14 @@ * ************************************************************************************* */ -package coupledL2 +package coupledL2.tl2tl import chisel3._ import chisel3.util._ import utility._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ +import coupledL2._ import coupledL2.utils.XSPerfAccumulate import huancun.DirtyKey @@ -116,6 +117,13 @@ import huancun.DirtyKey // XSPerfAccumulate(cacheParams, "sourceC_full", full) //} +class SourceCBlockBundle(implicit p: Parameters) extends L2Bundle { + val blockSinkBReqEntrance = Bool() + val blockMSHRReqEntrance = Bool() + + def apply() = 0.U.asTypeOf(this) +} + class SourceC(implicit p: Parameters) extends L2Module { val io = IO(new Bundle() { val in = Flipped(DecoupledIO(new Bundle() { @@ -125,10 +133,7 @@ class SourceC(implicit p: Parameters) extends L2Module { val out = DecoupledIO(new TLBundleC(edgeOut.bundle)) val resp = Output(new RespBundle) val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatus))) - val toReqArb = Output(new Bundle() { - val blockSinkBReqEntrance = Bool() - val blockMSHRReqEntrance = Bool() - }) + val toReqArb = Output(new SourceCBlockBundle) }) // We must keep SourceC FIFO, so a queue is used diff --git a/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala new file mode 100644 index 00000000..e53f3b92 --- /dev/null +++ b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala @@ -0,0 +1,339 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.tl2tl + +import chisel3._ +import chisel3.util._ +import utility.{FastArbiter, Pipeline, ParallelPriorityMux, RegNextN} +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import freechips.rocketchip.util._ +import org.chipsalliance.cde.config.{Parameters, Field} +import scala.math.max +import coupledL2._ +import coupledL2.prefetch._ +import coupledL2.utils.XSPerfAccumulate +import huancun.{TPmetaReq, TPmetaResp} + +class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { + + val managerPortParams = (m: TLSlavePortParameters) => TLSlavePortParameters.v1( + m.managers.map { m => + m.v2copy( + regionType = if (m.regionType >= RegionType.UNCACHED) RegionType.CACHED else m.regionType, + supports = TLMasterToSlaveTransferSizes( + acquireB = xfer, + acquireT = if (m.supportsAcquireT) xfer else TransferSizes.none, + arithmetic = if (m.supportsAcquireT) atom else TransferSizes.none, + logical = if (m.supportsAcquireT) atom else TransferSizes.none, + get = access, + putFull = if (m.supportsAcquireT) access else TransferSizes.none, + putPartial = if (m.supportsAcquireT) access else TransferSizes.none, + hint = access + ), + fifoId = None + ) + }, + beatBytes = 32, + minLatency = 2, + responseFields = cacheParams.respField, + requestKeys = cacheParams.reqKey, + endSinkId = idsAll + ) + + val clientPortParams = (m: TLMasterPortParameters) => TLMasterPortParameters.v2( + Seq( + TLMasterParameters.v2( + name = cacheParams.name, + supports = TLSlaveToMasterTransferSizes( + probe = xfer + ), + sourceId = IdRange(0, idsAll) + ) + ), + channelBytes = cacheParams.channelBytes, + minLatency = 1, + echoFields = cacheParams.echoField, + requestFields = cacheParams.reqField, + responseKeys = cacheParams.respKey + ) + + val node = TLAdapterNode( + clientFn = clientPortParams, + managerFn = managerPortParams + ) + + val tpmeta_source_node = prefetchOpt match { + case Some(param: PrefetchReceiverParams) => + if (param.hasTPPrefetcher) Some(BundleBridgeSource(() => DecoupledIO(new TPmetaReq))) else None + case _ => None + } + val tpmeta_sink_node = prefetchOpt match { + case Some(param: PrefetchReceiverParams) => + if (param.hasTPPrefetcher) Some(BundleBridgeSink(Some(() => ValidIO(new TPmetaResp)))) else None + case _ => None + } + + class CoupledL2Imp(wrapper: LazyModule) extends LazyModuleImp(wrapper) { + + val banks = node.in.size + val bankBits = if (banks == 1) 0 else log2Up(banks) + val l2TlbParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } + val io = IO(new Bundle { + val hartId = Input(UInt(hartIdLen.W)) + // val l2_hint = Valid(UInt(32.W)) + val l2_hint = ValidIO(new L2ToL1Hint()) + val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams) + val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) + val robHeadPaddr = Flipped(Valid(UInt(36.W))) + val l2MissMatch = Output(Bool()) + } + }) + + // Display info + val sizeBytes = cacheParams.toCacheParams.capacity.toDouble + val sizeStr = sizeBytesToStr(sizeBytes) + val prefetch = "prefetch: " + cacheParams.prefetch + println(s"====== Inclusive TL-TL ${cacheParams.name} ($sizeStr * $banks-bank) $prefetch ======") + println(s"bankBits: ${bankBits}") + println(s"replacement: ${cacheParams.replacement}") + println(s"replace policy: 
${cacheParams.releaseData}") + println(s"sets:${cacheParams.sets} ways:${cacheParams.ways} blockBytes:${cacheParams.blockBytes}") + print_bundle_fields(node.in.head._2.bundle.requestFields, "usr") + print_bundle_fields(node.in.head._2.bundle.echoFields, "echo") + + node.edges.in.headOption.foreach { n => + n.client.clients.zipWithIndex.foreach { + case (c, i) => + println(s"\t${i} <= ${c.name};" + + s"\tsourceRange: ${c.sourceId.start}~${c.sourceId.end}") + } + } + + // connection between prefetcher and the slices + val pftParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } + val prefetcher = prefetchOpt.map(_ => Module(new Prefetcher()(pftParams))) + val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams))))) + val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchResp()(pftParams))))) + val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) + io.l2_tlb_req <> DontCare // TODO: l2_tlb_req should be Option + prefetchOpt.foreach { + _ => + fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) + prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR + prefetcher.get.hartId := io.hartId + fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) + prefetcher.get.io.tlb_req <> io.l2_tlb_req + } + pf_recv_node match { + case Some(x) => + prefetcher.get.io.recv_addr.valid := x.in.head._1.addr_valid + prefetcher.get.io.recv_addr.bits.addr := x.in.head._1.addr + prefetcher.get.io.recv_addr.bits.pfSource := x.in.head._1.pf_source + prefetcher.get.io_l2_pf_en := x.in.head._1.l2_pf_en + case None => + prefetcher.foreach{ + p => + p.io.recv_addr := 0.U.asTypeOf(p.io.recv_addr) + p.io_l2_pf_en := false.B + } + } + + tpmeta_source_node match { + case Some(x) => + x.out.head._1 <> prefetcher.get.tpio.tpmeta_port.get.req + case None => + } + tpmeta_sink_node match { + case Some(x) => + prefetcher.get.tpio.tpmeta_port.get.resp <> x.in.head._1 + case None => + } + + def bank_eq(set: UInt, bankId: Int, bankBits: Int): Bool = { + if(bankBits == 0) true.B else set(bankBits - 1, 0) === bankId.U + } + + // ** WARNING:TODO: this depends on where the latch is + // ** if Hint latched in slice, while D-Channel latched in XSTile + // ** we need only [hintCycleAhead - 1] later + val sliceAhead = hintCycleAhead - 1 + + val hintChosen = Wire(UInt(banks.W)) + val hintFire = Wire(Bool()) + + // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice + // then we releaseSourceD, enabling io.d.ready for other slices + // TODO: if Hint for single slice is 100% accurate, may consider remove this + val releaseSourceD = Wire(Vec(node.in.size, Bool())) + val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR + + val slices = node.in.zip(node.out).zipWithIndex.map { + case (((in, edgeIn), (out, edgeOut)), i) => + require(in.params.dataBits == out.params.dataBits) + val rst_L2 = reset + val slice = withReset(rst_L2) { + Module(new Slice()(p.alterPartial { + case EdgeInKey => edgeIn + case EdgeOutKey => edgeOut + case BankBitsKey => bankBits + case SliceIdKey => i + })) + } + slice.io.in <> in + if(enableHintGuidedGrant) { + // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) + // we will try our best to select the grant of slice X. 
+        // If slice X has no grant by then, it means that the hint at cycle T is wrong,
+        // so we relax the restriction on grant selection.
+        val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) ||
+          RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1)
+
+        releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid
+
+        in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire)
+        slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire)
+      }
+      in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i)
+      out <> slice.io.out
+      out.a.bits.address := restoreAddress(slice.io.out.a.bits.address, i)
+      out.c.bits.address := restoreAddress(slice.io.out.c.bits.address, i)
+      slice.io.sliceId := i.U
+
+      slice.io.prefetch.zip(prefetcher).foreach {
+        case (s, p) =>
+          s.req.valid := p.io.req.valid && bank_eq(p.io.req.bits.set, i, bankBits)
+          s.req.bits := p.io.req.bits
+          prefetchReqsReady(i) := s.req.ready && bank_eq(p.io.req.bits.set, i, bankBits)
+          val train = Pipeline(s.train)
+          val resp = Pipeline(s.resp)
+          prefetchTrains.get(i) <> train
+          prefetchResps.get(i) <> resp
+          // restore to full address
+          if (bankBits != 0) {
+            val train_full_addr = Cat(
+              train.bits.tag, train.bits.set, i.U(bankBits.W), 0.U(offsetBits.W)
+            )
+            val (train_tag, train_set, _) = s.parseFullAddress(train_full_addr)
+            val resp_full_addr = Cat(
+              resp.bits.tag, resp.bits.set, i.U(bankBits.W), 0.U(offsetBits.W)
+            )
+            val (resp_tag, resp_set, _) = s.parseFullAddress(resp_full_addr)
+            prefetchTrains.get(i).bits.tag := train_tag
+            prefetchTrains.get(i).bits.set := train_set
+            prefetchResps.get(i).bits.tag := resp_tag
+            prefetchResps.get(i).bits.set := resp_set
+          }
+          s.tlb_req.req.valid := false.B
+          s.tlb_req.req.bits := DontCare
+          s.tlb_req.req_kill := DontCare
+          s.tlb_req.resp.ready := true.B
+      }
+
+      slice
+  }
+
+  if (enableHintGuidedGrant) {
+    // for timing considerations, the hint should latch one cycle before being sent to L1;
+    // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false"
+    val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size))
+    val slices_l1Hint = slices.zipWithIndex.map {
+      case (s, i) => s.io.l1Hint
+    }
+    // should only Hint for DCache
+    val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients
+      .filter(_.supports.probe)
+      .map(c => {
+        (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U)
+      }).head
+
+    l1HintArb.io.in <> VecInit(slices_l1Hint)
+    io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache
+    io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart
+    io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword
+    // continuous hints can only be sent every two cycles, since GrantData takes two cycles
+    l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B)
+
+    hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT !
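+    // Note: Arbiter.io.chosen is a log2Ceil(n)-bit binary index, not a one-hot
+    // vector; a one-hot mask, if one were ever needed, could be derived as
+    // UIntToOH(l1HintArb.io.chosen, slices.size).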
+ hintFire := io.l2_hint.valid + } + + // ==================== TopDown ==================== + val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + }))) + topDown match { + case Some(t) => + t.io.msStatus.zip(slices).foreach { + case (in, s) => in := s.io.msStatus.get + } + t.io.dirResult.zip(slices).foreach { + case (res, s) => res := s.io.dirResult.get + } + t.io.latePF.zip(slices).foreach { + case (in, s) => in := s.io.latePF.get + } + t.io.debugTopDown <> io.debugTopDown + case None => io.debugTopDown.l2MissMatch := false.B + } + + // ==================== XSPerf Counters ==================== + val grant_data_fire = slices.map { slice => { + val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) + slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData + } + } + XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) + + val hint_source = io.l2_hint.bits.sourceId + + val grant_data_source = ParallelPriorityMux(slices.map { + s => (s.io.in.d.fire, s.io.in.d.bits.source) + }) + + val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) + hintPipe2.io.in.valid := io.l2_hint.valid + hintPipe2.io.in.bits := hint_source + hintPipe2.io.out.ready := true.B + + val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) + hintPipe1.io.in.valid := io.l2_hint.valid + hintPipe1.io.in.bits := hint_source + hintPipe1.io.out.ready := true.B + + val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) + + val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "ok2Hints", okHint) + } + + lazy val module = new CoupledL2Imp(this) +} \ No newline at end of file diff --git a/src/test/scala/TestProbeQueue.scala b/src/test/scala/TestProbeQueue.scala index 719925d2..3a3a294e 100644 --- a/src/test/scala/TestProbeQueue.scala +++ b/src/test/scala/TestProbeQueue.scala @@ -12,6 +12,7 @@ import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tilelink._ import scala.collection.mutable.ArrayBuffer import huancun.DirtyField +import coupledL2.tl2tl._ diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala index 7ec41f25..d6d49605 100644 --- a/src/test/scala/TestTop.scala +++ b/src/test/scala/TestTop.scala @@ -9,6 +9,7 @@ import freechips.rocketchip.tile.MaxHartIdBits import freechips.rocketchip.tilelink._ import huancun._ import coupledL2.prefetch._ +import coupledL2.tl2tl._ import utility.{ChiselDB, FileRegisters, TLLogger} @@ -56,7 +57,9 @@ class TestTop_L2()(implicit p: Parameters) extends LazyModule { val l1d_nodes = (0 until 1) map( i => createClientNode(s"l1d$i", 32)) val master_nodes = l1d_nodes - val l2 = LazyModule(new CoupledL2()) + val l2 = LazyModule(new TL2TLCoupledL2()(new Config((_, _, _) => { + case BankBitsKey => 0 + }))) val xbar = TLXbar() val ram = LazyModule(new TLRAM(AddressSet(0, 0xffffL), beatBytes = 32)) @@ -135,7 +138,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { )) val master_nodes = Seq(l1d, l1i) - val l2 = LazyModule(new CoupledL2()(baseConfig(1).alterPartial({ + val l2 = LazyModule(new TL2TLCoupledL2()(baseConfig(1).alterPartial({ case L2ParamKey => L2Param( name = s"l2", ways = 4, @@ -147,6 +150,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { 
rrTagBits = 6 )) ) + case BankBitsKey => 0 }))) val l3 = LazyModule(new HuanCun()(baseConfig(1).alterPartial({ @@ -263,7 +267,9 @@ class TestTop_L2_Standalone()(implicit p: Parameters) extends LazyModule { val l1d_nodes = (0 until 1) map( i => createClientNode(s"l1d$i", 32)) val master_nodes = l1d_nodes - val l2 = LazyModule(new CoupledL2()) + val l2 = LazyModule(new TL2TLCoupledL2()(new Config((_, _, _) => { + case BankBitsKey => 0 + }))) val xbar = TLXbar() val l3 = createManagerNode("Fake_L3", 16) @@ -337,7 +343,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { val l1d_nodes = (0 until nrL2).map(i => createClientNode(s"l1d$i", 32)) val master_nodes = l1d_nodes - val coupledL2 = (0 until nrL2).map(i => LazyModule(new CoupledL2()(baseConfig(1).alterPartial({ + val coupledL2 = (0 until nrL2).map(i => LazyModule(new TL2TLCoupledL2()(baseConfig(1).alterPartial({ case L2ParamKey => L2Param( name = s"l2$i", ways = 4, @@ -346,6 +352,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { echoField = Seq(DirtyField()), hartId = i ) + case BankBitsKey => 0 })))) val l2_nodes = coupledL2.map(_.node) @@ -373,11 +380,15 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { val ram = LazyModule(new TLRAM(AddressSet(0, 0xffffL), beatBytes = 32)) l1d_nodes.zip(l2_nodes).zipWithIndex map { - case ((l1d, l2), i) => l2 := TLLogger(s"L2_L1_${i}", true) := TLBuffer() := l1d + case ((l1d, l2), i) => l2 := + TLLogger(s"L2_L1_${i}", !cacheParams.FPGAPlatform && cacheParams.enableTLLog) := + TLBuffer() := l1d } l2_nodes.zipWithIndex map { - case(l2, i) => xbar := TLLogger(s"L3_L2_${i}", true) := TLBuffer() := l2 + case(l2, i) => xbar := + TLLogger(s"L3_L2_${i}", !cacheParams.FPGAPlatform && cacheParams.enableTLLog) := + TLBuffer() := l2 } ram.node := @@ -385,7 +396,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { TLFragmenter(32, 64) :=* TLCacheCork() :=* TLDelayer(delayFactor) :=* - TLLogger(s"MEM_L3", true) :=* + TLLogger(s"MEM_L3", !cacheParams.FPGAPlatform && cacheParams.enableTLLog) :=* l3.node :=* xbar lazy val module = new LazyModuleImp(this) { @@ -466,7 +477,7 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { master_nodes = master_nodes ++ Seq(l1d, l1i) // TODO val l1xbar = TLXbar() - val l2 = LazyModule(new CoupledL2()(baseConfig(1).alterPartial({ + val l2 = LazyModule(new TL2TLCoupledL2()(baseConfig(1).alterPartial({ case L2ParamKey => L2Param( name = s"l2$i", ways = 4, @@ -478,6 +489,7 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { rrTagBits = 6 )) ) + case BankBitsKey => 0 }))) l1xbar := TLBuffer() := l1i diff --git a/src/test/scala/chi/TestTop.scala b/src/test/scala/chi/TestTop.scala new file mode 100644 index 00000000..c77dfb98 --- /dev/null +++ b/src/test/scala/chi/TestTop.scala @@ -0,0 +1,258 @@ +package coupledL2 + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config._ +import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage} +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tile.MaxHartIdBits +import huancun._ +import coupledL2.prefetch._ +import coupledL2.tl2chi._ +import utility.{ChiselDB, FileRegisters, TLLogger} + +class TestTop_CHIL2(numCores: Int = 1, numULAgents: Int = 0, banks: Int = 1)(implicit p: Parameters) extends LazyModule + with HasCHIMsgParameters { + + /* L1D(L1I)* L1D(L1I)* ... L1D(L1I)* + * \ | / + * L2 L2 ... 
L2 + * \ | / + * \ | / + * CMN or VIP + */ + + override lazy val desiredName: String = "TestTop" + val delayFactor = 0.5 + val cacheParams = p(L2ParamKey) + + def createClientNode(name: String, sources: Int) = { + val masterNode = TLClientNode(Seq( + TLMasterPortParameters.v2( + masters = Seq( + TLMasterParameters.v1( + name = name, + sourceId = IdRange(0, sources), + supportsProbe = TransferSizes(cacheParams.blockBytes) + ) + ), + channelBytes = TLChannelBeatBytes(cacheParams.blockBytes), + minLatency = 1, + echoFields = Nil, + requestFields = Seq(AliasField(2)), + responseKeys = cacheParams.respKey + ) + )) + masterNode + } + + val l1d_nodes = (0 until numCores).map(i => createClientNode(s"l1d$i", 32)) + val l1i_nodes = (0 until numCores).map {i => + (0 until numULAgents).map { j => + TLClientNode(Seq( + TLMasterPortParameters.v1( + clients = Seq(TLMasterParameters.v1( + name = s"l1i${i}_${j}", + sourceId = IdRange(0, 32) + )) + ) + )) + } + } + + // val l2 = LazyModule(new TL2CHICoupledL2()) + val l2_nodes = (0 until numCores).map(i => LazyModule(new TL2CHICoupledL2()(new Config((_, _, _) => { + case L2ParamKey => L2Param( + name = s"l2$i", + ways = 4, + sets = 128, + clientCaches = Seq(L1Param(aliasBitsOpt = Some(2))), + // echoField = Seq(DirtyField), + enablePerf = false, + enableRollingDB = false, + enableMonitor = false, + enableTLLog = false, + elaboratedTopDown = false, + FPGAPlatform = false, + // SAM for CMN 2X2 Mesh + // sam = Seq( + // AddressSet(0x0L, 0xfffffffbfL) -> 8, + // AddressSet(0x40L, 0xfffffffbfL) -> 40 + // ) + hartId = i + ) + case EnableCHI => true + case BankBitsKey => log2Ceil(banks) + case MaxHartIdBits => log2Up(numCores) + })))) + + val bankBinders = (0 until numCores).map(_ => BankBinder(banks, 64)) + + l1d_nodes.zip(l2_nodes).zipWithIndex.foreach { case ((l1d, l2), i) => + val l1xbar = TLXbar() + l1xbar := + TLLogger(s"L2_L1_CORE${i}_TLC", !cacheParams.FPGAPlatform && cacheParams.enableTLLog) := + TLBuffer() := l1d + + l1i_nodes(i).zipWithIndex.foreach { case (l1i, j) => + l1xbar := + TLLogger(s"L2_L1_CORE${i}_TLUL${j}", !cacheParams.FPGAPlatform && cacheParams.enableTLLog) := + TLBuffer() := l1i + } + + l2.managerNode := + TLXbar() :=* + bankBinders(i) :*= + l2.node :*= + l1xbar + /** + * MMIO: make diplomacy happy + */ + val mmioClientNode = TLClientNode(Seq( + TLMasterPortParameters.v1( + clients = Seq(TLMasterParameters.v1( + "uncache" + )) + ) + )) + l2.mmioBridge.mmioNode := mmioClientNode + } + + lazy val module = new LazyModuleImp(this){ + val timer = WireDefault(0.U(64.W)) + val logEnable = WireDefault(false.B) + val clean = WireDefault(false.B) + val dump = WireDefault(false.B) + + dontTouch(timer) + dontTouch(logEnable) + dontTouch(clean) + dontTouch(dump) + + l1d_nodes.zipWithIndex.foreach{ + case (node, i) => + node.makeIOs()(ValName(s"master_port_$i")) + } + if (numULAgents != 0) { + l1i_nodes.zipWithIndex.foreach { case (core, i) => + core.zipWithIndex.foreach { case (node, j) => + node.makeIOs()(ValName(s"master_ul_port_${i}_${j}")) + } + } + } + + val io = IO(Vec(numCores, new Bundle() { + val chi = new PortIO + })) + + l2_nodes.zipWithIndex.foreach { case (l2, i) => + l2.module.io.chi <> io(i).chi + dontTouch(l2.module.io) + + l2.module.io.hartId := i.U + l2.module.io.nodeID := i.U(NODEID_WIDTH.W) + l2.module.io.debugTopDown := DontCare + l2.module.io.l2_tlb_req <> DontCare + } + } + +} + + +object TestTopCHIHelper { + def gen(fTop: Parameters => TestTop_CHIL2)(args: Array[String]) = { + val config = new Config((_, _, _) => { + case 
L2ParamKey => L2Param(
+        FPGAPlatform = true
+      )
+    })
+
+    val top = DisableMonitors(p => LazyModule(fTop(p)))(config)
+
+    (new ChiselStage).execute(args, Seq(
+      ChiselGeneratorAnnotation(() => top.module)
+    ))
+
+    ChiselDB.init(false)
+    ChiselDB.addToFileRegisters
+    FileRegisters.write("./build")
+  }
+}
+
+
+object TestTop_CHI_DualCore_0UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 2,
+    numULAgents = 0,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_DualCore_2UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 2,
+    numULAgents = 2,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_QuadCore_0UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 4,
+    numULAgents = 0,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_QuadCore_2UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 4,
+    numULAgents = 2,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_OctaCore_0UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 8,
+    numULAgents = 0,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_OctaCore_2UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 8,
+    numULAgents = 2,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_HexaCore_0UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 16,
+    numULAgents = 0,
+    banks = 1)(p)
+  )(args)
+}
+
+object TestTop_CHI_HexaCore_2UL extends App {
+
+  TestTopCHIHelper.gen(p => new TestTop_CHIL2(
+    numCores = 16,
+    numULAgents = 2,
+    banks = 1)(p)
+  )(args)
+}
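A usage sketch for the MemAttr bundle added by this patch. The wrapper module MemAttrDemo and the two request flavors below are illustrative assumptions, not part of the change; the EWA settings follow the comment semantics in the bundle definition.

import chisel3._
import coupledL2.tl2chi.MemAttr

class MemAttrDemo extends Module {
  val io = IO(new Bundle {
    val normal = Output(new MemAttr)
    val mmio   = Output(new MemAttr)
  })
  // Cacheable memory request: allocate into caches; EWA = true permits Comp
  // to be returned from an intermediate point such as a Home Node.
  io.normal := MemAttr(allocate = true.B, cacheable = true.B, device = false.B, ewa = true.B)
  // Device (MMIO) request: no allocation; Comp must come from the endpoint.
  io.mmio := MemAttr(allocate = false.B, cacheable = false.B, device = true.B, ewa = false.B)
}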
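Similarly, a minimal sketch of the SAM class from NetworkLayer.scala resolving a physical address to a target node ID. The interleaved address sets are borrowed from the commented-out "CMN 2X2 Mesh" example in TestTop.scala; SamDemo itself and its port widths (ADDR_WIDTH = 48, NODEID_WIDTH = 7) are assumptions for illustration.

import chisel3._
import freechips.rocketchip.diplomacy.AddressSet
import coupledL2.tl2chi.SAM

class SamDemo extends Module {
  val io = IO(new Bundle {
    val addr  = Input(UInt(48.W))  // ADDR_WIDTH = 48
    val tgtID = Output(UInt(7.W))  // NODEID_WIDTH = 7
  })
  // Two home nodes interleaved at 64-byte granularity (hypothetical layout)
  val sam = SAM(Seq(
    AddressSet(0x0L,  0xfffffffbfL) -> 8,
    AddressSet(0x40L, 0xfffffffbfL) -> 40
  ))
  // lookup asserts that the address hits some entry, then priority-muxes its node ID
  io.tgtID := sam.lookup(io.addr)
}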
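Finally, a sketch of the snoop-classification predicates defined in Opcode.scala; the SnpClassifier wrapper and its port names are hypothetical, but the helper calls are the ones the patch provides.

import chisel3._
import coupledL2.tl2chi.CHIOpcode.SNPOpcodes._

class SnpClassifier extends Module {
  val io = IO(new Bundle {
    val opcode = Input(UInt(5.W))  // SNP_OPCODE_WIDTH = 5
    val toB    = Output(Bool())    // downgrade the line to Branch (shared)
    val toN    = Output(Bool())    // invalidate the line to None
    val fwd    = Output(Bool())    // DCT: forward data directly to the requester
  })
  io.toB := isSnpToB(io.opcode)   // SnpClean/SnpShared/SnpNotSharedDirty (and *Fwd)
  io.toN := isSnpToN(io.opcode)   // SnpUnique*/SnpCleanInvalid/SnpMakeInvalid*
  io.fwd := isSnpXFwd(io.opcode)  // any opcode at or above SnpSharedFwd
}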