Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

GrantBuffer: Modify assert 'inflightGrant entries should not be full'… #154

Closed
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
5 changes: 5 additions & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
* @Ivyfeather @linjuanZ
12 changes: 8 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -6,9 +6,9 @@ name: CI
on:
# Triggers the workflow on push or pull request events but only for the main branch
push:
branches: [ master, ci-test ]
branches: [ master, chi-coupledl2 ]
pull_request:
branches: [ master, ci-test ]
branches: [ master, chi-coupledl2 ]

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
@@ -47,8 +47,8 @@ jobs:

- name: Compile
run: make compile

- name: Unit test
- name: Unit test for TileLink version
run: |
git clone https://github.com/OpenXiangShan/tl-test -b coupledL2-huancun
make test-top-l2l3l2
@@ -57,3 +57,7 @@ jobs:
cmake .. -DDUT_DIR=../../build -DCHISELDB=1
make
./tlc_test -s $RANDOM

- name: Compile CHI QuadCore
run: |
make test-top-chi-quadcore-2ul
24 changes: 24 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -20,6 +20,30 @@ test-top-l2l3l2:
test-top-fullsys:
mill -i CoupledL2.test.runMain coupledL2.TestTop_fullSys -td build

test-top-chi-dualcore-0ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_DualCore_0UL -td build

test-top-chi-dualcore-2ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_DualCore_2UL -td build

test-top-chi-quadcore-0ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_QuadCore_0UL -td build

test-top-chi-quadcore-2ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_QuadCore_2UL -td build

test-top-chi-octacore-0ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_OctaCore_0UL -td build

test-top-chi-octacore-2ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_OctaCore_2UL -td build

test-top-chi-hexacore-0ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_HexaCore_0UL -td build

test-top-chi-hexacore-2ul:
mill -i CoupledL2.test.runMain coupledL2.TestTop_CHI_HexaCore_2UL -td build

clean:
rm -rf ./build

115 changes: 78 additions & 37 deletions src/main/scala/coupledL2/Common.scala
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ import chisel3.util._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink.TLPermissions._
import utility.MemReqSource
import tl2chi.{HasCHIMsgParameters, HasCHIChannelBits, CHIREQ, MemAttr, OrderEncodings}

abstract class L2Module(implicit val p: Parameters) extends Module with HasCoupledL2Parameters
abstract class L2Bundle(implicit val p: Parameters) extends Bundle with HasCoupledL2Parameters
@@ -32,7 +33,7 @@ class ReplacerInfo(implicit p: Parameters) extends L2Bundle {
val reqSource = UInt(MemReqSource.reqSourceBits.W)
}

trait HasChannelBits { this: Bundle =>
trait HasTLChannelBits { this: Bundle =>
val channel = UInt(3.W)
def fromA = channel(0).asBool
def fromB = channel(1).asBool
@@ -52,7 +53,10 @@ class MergeTaskBundle(implicit p: Parameters) extends L2Bundle {

// We generate a Task for every TL request
// this is the info that flows in Mainpipe
class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits {
class TaskBundle(implicit p: Parameters) extends L2Bundle
with HasTLChannelBits
with HasCHIMsgParameters
with HasCHIChannelBits {
val set = UInt(setBits.W)
val tag = UInt(tagBits.W)
val off = UInt(offsetBits.W)
@@ -104,9 +108,47 @@ class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits {
// for merged MSHR tasks(Acquire & late Prefetch)
val mergeA = Bool()
val aMergeTask = new MergeTaskBundle()

// Used for get data from ReleaseBuf when snoop hit with same PA
val snpHitRelease = Bool()
val snpHitReleaseWithData = Bool()
val snpHitReleaseIdx = UInt(mshrBits.W)
// CHI
val tgtID = chiOpt.map(_ => UInt(TGTID_WIDTH.W))
val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W))
val txnID = chiOpt.map(_ => UInt(TXNID_WIDTH.W))
val homeNID = chiOpt.map(_ => UInt(SRCID_WIDTH.W))
val dbID = chiOpt.map(_ => UInt(DBID_WIDTH.W))
val fwdNID = chiOpt.map(_ => UInt(FWDNID_WIDTH.W))
val fwdTxnID = chiOpt.map(_ => UInt(FWDTXNID_WIDTH.W))
val chiOpcode = chiOpt.map(_ => UInt(OPCODE_WIDTH.W))
val resp = chiOpt.map(_ => UInt(RESP_WIDTH.W))
val fwdState = chiOpt.map(_ => UInt(FWDSTATE_WIDTH.W))
val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W))
val retToSrc = chiOpt.map(_ => Bool()) // only used in snoop
val expCompAck = chiOpt.map(_ => Bool())
val allowRetry = chiOpt.map(_ => Bool())
val memAttr = chiOpt.map(_ => new MemAttr)

def toCHIREQBundle(): CHIREQ = {
val req = WireInit(0.U.asTypeOf(new CHIREQ()))
req.tgtID := tgtID.getOrElse(0.U)
req.srcID := srcID.getOrElse(0.U)
req.txnID := txnID.getOrElse(0.U)
req.opcode := chiOpcode.getOrElse(0.U)
req.addr := Cat(tag, set, 0.U(offsetBits.W))
req.allowRetry := allowRetry.getOrElse(true.B) //TODO: consider retry
req.pCrdType := pCrdType.getOrElse(0.U)
req.expCompAck := expCompAck.getOrElse(false.B)
req.memAttr := memAttr.getOrElse(MemAttr())
req.snpAttr := true.B
req.order := OrderEncodings.None
req
}
}

class PipeStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits
class PipeStatus(implicit p: Parameters) extends L2Bundle
with HasTLChannelBits

class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle {
val tags = Vec(4, UInt(tagBits.W))
@@ -123,34 +165,6 @@ class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle {
def g_set = sets(3)
}

// MSHR exposes signals to MSHRCtl
class MSHRStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits {
val set = UInt(setBits.W)
val reqTag = UInt(tagBits.W)
val metaTag = UInt(tagBits.W)
val needsRepl = Bool()
val w_c_resp = Bool()
val w_d_resp = Bool()
val will_free = Bool()

// val way = UInt(wayBits.W)
// val off = UInt(offsetBits.W)
// val opcode = UInt(3.W)
// val param = UInt(3.W)
// val size = UInt(msgSizeBits.W)
// val source = UInt(sourceIdBits.W)
// val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W))
// val aliasTask = aliasBitsOpt.map(_ => Bool())
// val needProbeAckData = Bool() // only for B reqs
// val fromL2pft = prefetchOpt.map(_ => Bool())
// val needHint = prefetchOpt.map(_ => Bool())

// for TopDown usage
val reqSource = UInt(MemReqSource.reqSourceBits.W)
val is_miss = Bool()
val is_prefetch = Bool()
}

// MSHR Task that MainPipe sends to MSHRCtl
class MSHRRequest(implicit p: Parameters) extends L2Bundle {
val dirResult = new DirResult()
@@ -159,11 +173,12 @@ class MSHRRequest(implicit p: Parameters) extends L2Bundle {
}

// MSHR info to ReqBuf and SinkB
class MSHRInfo(implicit p: Parameters) extends L2Bundle {
class MSHRInfo(implicit p: Parameters) extends L2Bundle with HasTLChannelBits {
val set = UInt(setBits.W)
val way = UInt(wayBits.W)
val reqTag = UInt(tagBits.W)
val willFree = Bool()
val aliasTask = aliasBitsOpt.map(_ => Bool())

// to block Acquire for to-be-replaced data until Release done (indicated by ReleaseAck received)
val needRelease = Bool()
@@ -172,28 +187,42 @@ class MSHRInfo(implicit p: Parameters) extends L2Bundle {
val blockRefill = Bool()

val metaTag = UInt(tagBits.W)
val metaState = UInt(stateBits.W)
val dirHit = Bool()

// decide whether can nest B (req same-addr)
val nestB = Bool()

// to drop duplicate prefetch reqs
val isAcqOrPrefetch = Bool()
val isPrefetch = Bool()

// whether the mshr_task already in mainpipe
val s_refill = Bool()
val param = UInt(3.W)
val mergeA = Bool() // whether the mshr already merge an acquire(avoid alias merge)

val w_grantfirst = Bool()
val s_refill = Bool()
val w_releaseack = Bool()
val w_replResp = Bool()
val w_rprobeacklast = Bool()

val replaceData = Bool() // If there is a replace, WriteBackFull or Evict
}

class RespInfoBundle(implicit p: Parameters) extends L2Bundle {
class RespInfoBundle(implicit p: Parameters) extends L2Bundle
with HasCHIMsgParameters
{
val opcode = UInt(3.W)
val param = UInt(3.W)
val last = Bool() // last beat
val dirty = Bool() // only used for sinkD resps
val isHit = Bool() // only used for sinkD resps
//CHI
val chiOpcode = chiOpt.map(_ => UInt(OPCODE_WIDTH.W))
val txnID = chiOpt.map(_ => UInt(TXNID_WIDTH.W))
val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W))
val homeNID = chiOpt.map(_ => UInt(SRCID_WIDTH.W))
val dbID = chiOpt.map(_ => UInt(DBID_WIDTH.W))
val resp = chiOpt.map(_ => UInt(RESP_WIDTH.W))
val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W))
}

class RespBundle(implicit p: Parameters) extends L2Bundle {
@@ -227,6 +256,12 @@ class FSMState(implicit p: Parameters) extends L2Bundle {
val w_grant = Bool()
val w_releaseack = Bool()
val w_replResp = Bool()

// CHI
val s_compack = chiOpt.map(_ => Bool())
val s_cbwrdata = chiOpt.map(_ => Bool())
val s_reissue = chiOpt.map(_ => Bool())
val s_dct = chiOpt.map(_ => Bool())
}

class SourceAReq(implicit p: Parameters) extends L2Bundle {
@@ -260,7 +295,13 @@ class BlockInfo(implicit p: Parameters) extends L2Bundle {
class NestedWriteback(implicit p: Parameters) extends L2Bundle {
val set = UInt(setBits.W)
val tag = UInt(tagBits.W)
// Nested ReleaseData sets block dirty
val c_set_dirty = Bool()
// Nested Snoop invalidates block
val b_inv_dirty = Bool()

val b_toB = chiOpt.map(_ => Bool())
val b_toN = chiOpt.map(_ => Bool())
}

class PrefetchRecv extends Bundle {
6 changes: 6 additions & 0 deletions src/main/scala/coupledL2/Consts.scala
Original file line number Diff line number Diff line change
@@ -30,6 +30,11 @@ object MetaData {
def TRUNK: UInt = 2.U(stateBits.W) // unique inner master cache is trunk
def TIP: UInt = 3.U(stateBits.W) // we are trunk, inner masters are branch

def needB(opcode: UInt, param: UInt): Bool = {
opcode === TLMessages.Get ||
opcode === TLMessages.AcquireBlock && param === TLPermissions.NtoB ||
opcode === TLMessages.Hint && param === TLHints.PREFETCH_READ
}
// Does a request need trunk to be handled?
def needT(opcode: UInt, param: UInt): Bool = {
!opcode(2) ||
@@ -64,4 +69,5 @@ object MetaData {
Seq(INVALID, INVALID, BRANCH)
)
}
def isValid(state: UInt): Bool = state > INVALID
}
372 changes: 44 additions & 328 deletions src/main/scala/coupledL2/CoupledL2.scala

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions src/main/scala/coupledL2/DataStorage.scala
Original file line number Diff line number Diff line change
@@ -51,7 +51,8 @@ class DataStorage(implicit p: Parameters) extends L2Module {
gen = new DSBlock,
set = blocks,
way = 1,
singlePort = true
singlePort = true,
holdRead = true
))

val arrayIdx = Cat(io.req.bits.way, io.req.bits.set)
@@ -60,7 +61,10 @@ class DataStorage(implicit p: Parameters) extends L2Module {
array.io.w.apply(wen, io.wdata, arrayIdx, 1.U)
array.io.r.apply(ren, arrayIdx)

// TODO: timing: we should not use reg here, instead set this as multicycle path
// for timing, we set this as multicycle path
// s3 read, s4 pass and s5 to destination
io.rdata := RegNextN(array.io.r.resp.data(0), 1)
io.rdata := array.io.r.resp.data(0)

assert(!io.req.valid || !RegNext(io.req.valid, false.B),
"Continuous SRAM req prohibited under MCP2!")
}
15 changes: 11 additions & 4 deletions src/main/scala/coupledL2/GrantBuffer.scala
Original file line number Diff line number Diff line change
@@ -109,8 +109,9 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
}))

val dtaskOpcode = io.d_task.bits.task.opcode
val mergeAtask = Wire(new TaskBundle())
val mergeAtask = WireInit(0.U.asTypeOf(new TaskBundle()))
mergeAtask.channel := io.d_task.bits.task.channel
mergeAtask.txChannel := io.d_task.bits.task.txChannel
mergeAtask.off := io.d_task.bits.task.aMergeTask.off
mergeAtask.alias.foreach(_ := io.d_task.bits.task.aMergeTask.alias.getOrElse(0.U))
mergeAtask.opcode := io.d_task.bits.task.aMergeTask.opcode
@@ -252,7 +253,7 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
entry.bits.tag := io.d_task.bits.task.tag
}
val inflight_full = Cat(inflightGrant.map(_.valid)).andR
assert(!inflight_full, "inflightGrant entries should not be full")
assert(!(inflight_full & (io.d_task.fire && (dtaskOpcode(2, 1) === Grant(2, 1) || io.d_task.bits.task.mergeA))), "inflightGrant entries overflow")

// report status to SourceB to block same-addr Probe
io.grantStatus zip inflightGrant foreach {
@@ -275,10 +276,16 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
val noSpaceForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
s.valid && (s.bits.fromA || s.bits.fromC)
}).asUInt) + grantQueueCnt >= mshrsAll.U
val noSpaceWaitSinkEForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
s.valid && s.bits.fromA
}).asUInt) + PopCount(VecInit(inflightGrant.map(x => x.valid))) >= mshrsAll.U
// for timing consideration, drop s1 info, so always reserve one entry for it
val noSpaceForMSHRReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
s.valid && (s.bits.fromA || s.bits.fromC)
}).asUInt) + grantQueueCnt >= (mshrsAll-1).U
val noSpaceWaitSinkEForMSHRReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
s.valid && s.bits.fromA
}).asUInt) + PopCount(VecInit(inflightGrant.map(x => x.valid))) >= (mshrsAll - 1).U
// pftRespQueue also requires back pressure to ensure that it will not exceed capacity
// Ideally, it should only block Prefetch from entering MainPipe
// But since it is extremely rare that pftRespQueue of 10 would be full, we just block all Entrance here, simpler logic
@@ -290,14 +297,14 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
s.valid && s.bits.fromA
}).asUInt) + pftRespQueue.get.io.count >= (pftQueueLen-1).U)

io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceForSinkPft.getOrElse(false.B)
io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceWaitSinkEForSinkReq || noSpaceForSinkPft.getOrElse(false.B)
io.toReqArb.blockSinkReqEntrance.blockB_s1 := Cat(inflightGrant.map(g => g.valid &&
g.bits.set === io.fromReqArb.status_s1.b_set && g.bits.tag === io.fromReqArb.status_s1.b_tag)).orR
//TODO: or should we still Stall B req?
// A-replace related rprobe is handled in SourceB
io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq
io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B // this is not used
io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B)
io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceWaitSinkEForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B)

// =========== XSPerf ===========
if (cacheParams.enablePerf) {
22 changes: 15 additions & 7 deletions src/main/scala/coupledL2/L2Param.scala
Original file line number Diff line number Diff line change
@@ -19,16 +19,15 @@ package coupledL2

import chisel3._
import chisel3.util.log2Ceil
import freechips.rocketchip.diplomacy.BufferParams
import freechips.rocketchip.diplomacy.{BufferParams, AddressSet}
import freechips.rocketchip.tilelink._
import freechips.rocketchip.util._
import org.chipsalliance.cde.config.Field
import huancun.{AliasKey, CacheParameters, IsHitKey, PrefetchKey}
import coupledL2.prefetch._
import utility.{MemReqSource, ReqSourceKey}

// General parameter key of CoupledL2
case object L2ParamKey extends Field[L2Param](L2Param())
case object EnableCHI extends Field[Boolean](false)

// L1 Cache Params, used for TestTop generation
case class L1Param
@@ -55,8 +54,7 @@ case class VaddrField(width: Int) extends BundleField[UInt](VaddrKey, Output(UIn
case object IsKeywordKey extends ControlKey[Bool]("isKeyword")
case class IsKeywordField() extends BundleField[Bool](IsKeywordKey, Output(Bool()), _ := false.B)

case class L2Param
(
case class L2Param(
name: String = "L2",
ways: Int = 4,
sets: Int = 128,
@@ -72,8 +70,9 @@ case class L2Param
* 2 for all except prefetch & !accessed
* 3 for all
*/
mmioBridgeSize: Int = 8,

// Client (these are set in Configs.scala in XiangShan)
// Client
echoField: Seq[BundleFieldBase] = Nil,
reqField: Seq[BundleFieldBase] = Nil,
respKey: Seq[BundleKeyBase] = Seq(IsHitKey),
@@ -99,10 +98,15 @@ case class L2Param
enableRollingDB: Boolean = true,
// Monitor
enableMonitor: Boolean = true,
// TLLog
enableTLLog: Boolean = true,
// TopDown
elaboratedTopDown: Boolean = true,
// env
FPGAPlatform: Boolean = false
FPGAPlatform: Boolean = false,

// Network layer SAM
sam: Seq[(AddressSet, Int)] = Seq(AddressSet.everything -> 33)
) {
def toCacheParams: CacheParameters = CacheParameters(
name = name,
@@ -113,10 +117,14 @@ case class L2Param
)
}

case object L2ParamKey extends Field[L2Param](L2Param())

case object EdgeInKey extends Field[TLEdgeIn]

case object EdgeOutKey extends Field[TLEdgeOut]

case object BankBitsKey extends Field[Int]

case object L2NBanksKey extends Field[Int]

case object SliceIdKey extends Field[Int]
12 changes: 8 additions & 4 deletions src/main/scala/coupledL2/MSHRBuffer.scala
Original file line number Diff line number Diff line change
@@ -21,7 +21,6 @@ import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.Parameters
import coupledL2.utils._
import java.util.ResourceBundle

class MSHRBufRead(implicit p: Parameters) extends L2Bundle {
val id = Output(UInt(mshrBits.W))
@@ -34,6 +33,7 @@ class MSHRBufResp(implicit p: Parameters) extends L2Bundle {
class MSHRBufWrite(implicit p: Parameters) extends L2Bundle {
val id = Output(UInt(mshrBits.W))
val data = Output(new DSBlock)
val beatMask = Output(UInt(beatSize.W))
}

// MSHR Buffer is used when MSHR needs to save data, so each buffer entry corresponds to an MSHR
@@ -44,21 +44,25 @@ class MSHRBuffer(wPorts: Int = 1)(implicit p: Parameters) extends L2Module {
val w = Vec(wPorts, Flipped(ValidIO(new MSHRBufWrite)))
})

val buffer = Reg(Vec(mshrsAll, new DSBlock))
val buffer = Reg(Vec(mshrsAll, Vec(beatSize, UInt((beatBytes * 8).W))))

buffer.zipWithIndex.foreach {
case (block, i) =>
val wens = VecInit(io.w.map(w => w.valid && w.bits.id === i.U)).asUInt
assert(PopCount(wens) <= 2.U, "triple write to the same MSHR buffer entry")

val w_data = PriorityMux(wens, io.w.map(_.bits.data))
val w_beatSel = PriorityMux(wens, io.w.map(_.bits.beatMask))
when(wens.orR) {
block := w_data
// block := w_data
block.zip(w_beatSel.asBools).zipWithIndex.foreach { case ((beat, sel), i) =>
when (sel) { beat := w_data.data((i+1) * beatBytes * 8 - 1, i * beatBytes * 8) }
}
}
}

val ridReg = RegEnable(io.r.bits.id, 0.U(mshrBits.W), io.r.valid)
io.resp.data := buffer(ridReg)
io.resp.data.data := buffer(ridReg).asUInt
}

// may consider just choose an empty entry to insert
139 changes: 110 additions & 29 deletions src/main/scala/coupledL2/RequestArb.scala
Original file line number Diff line number Diff line change
@@ -24,6 +24,9 @@ import freechips.rocketchip.tilelink._
import freechips.rocketchip.tilelink.TLMessages._
import org.chipsalliance.cde.config.Parameters
import coupledL2.utils.XSPerfAccumulate
import coupledL2.tl2tl._
import coupledL2.tl2chi._
import coupledL2.tl2chi.CHIOpcode._

class RequestArb(implicit p: Parameters) extends L2Module {
val io = IO(new Bundle() {
@@ -53,7 +56,8 @@ class RequestArb(implicit p: Parameters) extends L2Module {

/* status of each pipeline stage */
val status_s1 = Output(new PipeEntranceStatus) // set & tag of entrance status
val status_vec = Vec(2, ValidIO(new PipeStatus)) // whether this stage will flow into SourceD
val status_vec = Vec(2, ValidIO(new PipeStatus))
val status_vec_toTX = if (enableCHI) Some(Vec(2, ValidIO(new PipeStatusWithCHI))) else None

/* handle set conflict, capacity conflict */
val fromMSHRCtl = Input(new BlockInfo())
@@ -62,10 +66,13 @@ class RequestArb(implicit p: Parameters) extends L2Module {
val blockSinkReqEntrance = new BlockInfo()
val blockMSHRReqEntrance = Bool()
})
val fromSourceC = Input(new Bundle() {
val blockSinkBReqEntrance = Bool()
val blockMSHRReqEntrance = Bool()
})
val fromSourceC = if (!enableCHI) Some(Input(new SourceCBlockBundle)) else None
val fromTXDAT = if (enableCHI) Some(Input(new TXDATBlockBundle)) else None
val fromTXRSP = if (enableCHI) Some(Input(new TXRSPBlockBundle)) else None
val fromTXREQ = if (enableCHI) Some(Input(new TXBlockBundle)) else None

/* MSHR Status */
val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo())))
})

/* ======== Reset ======== */
@@ -79,7 +86,10 @@ class RequestArb(implicit p: Parameters) extends L2Module {
resetFinish := true.B
}

val mshr_task_s0 = Wire(Valid(new TaskBundle()))
val s0_fire = Wire(Bool())
val s1_fire = Wire(Bool())
val s1_cango = Wire(Bool())
val s2_ready = Wire(Bool())
val mshr_task_s1 = RegInit(0.U.asTypeOf(Valid(new TaskBundle())))

val s1_needs_replRead = mshr_task_s1.valid && mshr_task_s1.bits.fromA && mshr_task_s1.bits.replTask && (
@@ -91,35 +101,48 @@ class RequestArb(implicit p: Parameters) extends L2Module {
/* ======== Stage 0 ======== */
// if mshr_task_s1 is replRead, it might stall and wait for dirRead.ready, so we block new mshrTask from entering
// TODO: will cause msTask path vacant for one-cycle after replRead, since not use Flow so as to avoid ready propagation
io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance && !s1_needs_replRead && !io.fromSourceC.blockMSHRReqEntrance
mshr_task_s0.valid := io.mshrTask.fire
mshr_task_s0.bits := io.mshrTask.bits
io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance && !s1_needs_replRead && !(mshr_task_s1.valid && !s2_ready)
(if (io.fromSourceC.isDefined) !io.fromSourceC.get.blockMSHRReqEntrance else true.B) &&
(if (io.fromTXDAT.isDefined) !io.fromTXDAT.get.blockMSHRReqEntrance else true.B) &&
(if (io.fromTXRSP.isDefined) !io.fromTXRSP.get.blockMSHRReqEntrance else true.B) &&
(if (io.fromTXREQ.isDefined) !io.fromTXREQ.get.blockMSHRReqEntrance else true.B)

s0_fire := io.mshrTask.valid && io.mshrTask.ready

/* ======== Stage 1 ======== */
/* latch mshr_task from s0 to s1 */
val mshr_replRead_stall = mshr_task_s1.valid && s1_needs_replRead && (!io.dirRead_s1.ready || io.fromMainPipe.blockG_s1)
mshr_task_s1.valid := mshr_task_s1.valid && !s1_fire || s0_fire

mshr_task_s1.valid := mshr_task_s0.valid || mshr_replRead_stall
when(mshr_task_s0.valid && !mshr_replRead_stall) {
mshr_task_s1.bits := mshr_task_s0.bits
when (s0_fire) {
mshr_task_s1.bits := io.mshrTask.bits
}


/* Channel interaction from s1 */
val A_task = io.sinkA.bits
val B_task = io.sinkB.bits
val C_task = io.sinkC.bits
val block_A = io.fromMSHRCtl.blockA_s1 || io.fromMainPipe.blockA_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockA_s1
val block_B = io.fromMSHRCtl.blockB_s1 || io.fromMainPipe.blockB_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockB_s1 || io.fromSourceC.blockSinkBReqEntrance
val block_B = io.fromMSHRCtl.blockB_s1 || io.fromMainPipe.blockB_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockB_s1 ||
(if (io.fromSourceC.isDefined) io.fromSourceC.get.blockSinkBReqEntrance else false.B) ||
(if (io.fromTXDAT.isDefined) io.fromTXDAT.get.blockSinkBReqEntrance else false.B) ||
(if (io.fromTXRSP.isDefined) io.fromTXRSP.get.blockSinkBReqEntrance else false.B)
val block_C = io.fromMSHRCtl.blockC_s1 || io.fromMainPipe.blockC_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockC_s1

val noFreeWay = Wire(Bool())

val sinkValids = VecInit(Seq(
io.sinkC.valid && !block_C,
io.sinkB.valid && !block_B,
io.sinkA.valid && !block_A
io.sinkA.valid && !block_A && !noFreeWay
)).asUInt

val sink_ready_basic = io.dirRead_s1.ready && resetFinish && !mshr_task_s1.valid
io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) // SinkC prior to SinkA & SinkB
// TODO: A Hint is allowed to enter if !s2_ready for mcp2_stall

val sink_ready_basic = io.dirRead_s1.ready && resetFinish && !mshr_task_s1.valid && s2_ready

io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) && !noFreeWay // SinkC prior to SinkA & SinkB
io.sinkB.ready := sink_ready_basic && !block_B && !sinkValids(0) // SinkB prior to SinkA
io.sinkC.ready := sink_ready_basic && !block_C

@@ -132,12 +155,15 @@ class RequestArb(implicit p: Parameters) extends L2Module {
val task_s1 = Mux(mshr_task_s1.valid, mshr_task_s1, chnl_task_s1)
val s1_to_s2_valid = task_s1.valid && !mshr_replRead_stall

io.taskInfo_s1.valid := s1_to_s2_valid
s1_cango := task_s1.valid && !mshr_replRead_stall
s1_fire := s1_cango && s2_ready

io.taskInfo_s1.valid := s1_fire
io.taskInfo_s1.bits := task_s1.bits

/* Meta read request */
// ^ only sinkA/B/C tasks need to read directory
io.dirRead_s1.valid := chnl_task_s1.valid && !mshr_task_s1.valid || s1_needs_replRead && !io.fromMainPipe.blockG_s1
io.dirRead_s1.valid := s2_ready && (chnl_task_s1.valid && !mshr_task_s1.valid || s1_needs_replRead && !io.fromMainPipe.blockG_s1)
io.dirRead_s1.bits.set := task_s1.bits.set
io.dirRead_s1.bits.tag := task_s1.bits.tag
// invalid way which causes mshr_retry
@@ -150,17 +176,30 @@ class RequestArb(implicit p: Parameters) extends L2Module {
io.dirRead_s1.bits.mshrId := task_s1.bits.mshrId

// block same-set A req
io.s1Entrance.valid := mshr_task_s1.valid && mshr_task_s1.bits.metaWen || io.sinkC.fire || io.sinkB.fire
io.s1Entrance.valid := mshr_task_s1.valid && s2_ready && mshr_task_s1.bits.metaWen || io.sinkC.fire || io.sinkB.fire
io.s1Entrance.bits.set := Mux(
mshr_task_s1.valid && mshr_task_s1.bits.metaWen,
mshr_task_s1.bits.set,
Mux(io.sinkC.fire, C_task.set, B_task.set)
)

/* ======== Stage 2 ======== */
val s1_AHint_fire = io.sinkA.fire && io.sinkA.bits.opcode === Hint
// any req except AHint might access DS, and continuous DS accesses are prohibited
val ds_mcp2_stall = RegNext(s1_fire && !s1_AHint_fire)

s2_ready := !ds_mcp2_stall

val task_s2 = RegInit(0.U.asTypeOf(task_s1))
task_s2.valid := s1_to_s2_valid
when(s1_to_s2_valid) { task_s2.bits := task_s1.bits }
task_s2.valid := s1_fire
when(s1_fire) { task_s2.bits := task_s1.bits }

val sameSet_s2 = task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask && task_s2.bits.set === A_task.set
val sameSet_s3 = RegNext(task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask) &&
RegEnable(task_s2.bits.set, task_s2.valid) === A_task.set
val sameSetCnt = PopCount(VecInit(io.msInfo.map(s => s.valid && s.bits.set === A_task.set && s.bits.fromA) :+
sameSet_s2 :+ sameSet_s3).asUInt)
noFreeWay := sameSetCnt >= cacheParams.ways.U

io.taskToPipe_s2 := task_s2

@@ -172,37 +211,79 @@ class RequestArb(implicit p: Parameters) extends L2Module {
// For GrantData, read refillBuffer
// Caution: GrantData-alias may read DataStorage or ReleaseBuf instead
// Release-replTask also read refillBuf and then write to DS
val releaseRefillData = task_s2.bits.replTask && (if (enableCHI) {
task_s2.bits.toTXREQ && (
task_s2.bits.chiOpcode.get === REQOpcodes.WriteBackFull ||
task_s2.bits.chiOpcode.get === REQOpcodes.Evict
)
} else {
task_s2.bits.opcode(2, 1) === Release(2, 1)
})
io.refillBufRead_s2.valid := mshrTask_s2 && (
task_s2.bits.fromB && task_s2.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s2.bits.replTask ||
task_s2.bits.opcode(2, 1) === Release(2, 1) && task_s2.bits.replTask ||
task_s2.bits.fromB && task_s2.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s2.bits.replTask || // ???
releaseRefillData ||
mshrTask_s2_a_upwards && !task_s2.bits.useProbeData)
io.refillBufRead_s2.bits.id := task_s2.bits.mshrId

// ReleaseData and ProbeAckData read releaseBuffer
// channel is used to differentiate GrantData and ProbeAckData
io.releaseBufRead_s2.valid := mshrTask_s2 && (
task_s2.bits.opcode === ReleaseData ||
task_s2.bits.fromB && task_s2.bits.opcode === ProbeAckData ||
mshrTask_s2_a_upwards && task_s2.bits.useProbeData)
io.releaseBufRead_s2.bits.id := task_s2.bits.mshrId
val snoopNeedData = if (enableCHI) {
task_s2.bits.fromB && task_s2.bits.toTXDAT && DATOpcodes.isSnpRespDataX(task_s2.bits.chiOpcode.get)
} else {
task_s2.bits.fromB && task_s2.bits.opcode === ProbeAckData
}
val releaseNeedData = if (enableCHI) {
task_s2.bits.toTXDAT && task_s2.bits.chiOpcode.get === DATOpcodes.CopyBackWrData
} else task_s2.bits.opcode === ReleaseData
val dctNeedData = if (enableCHI) {
task_s2.bits.toTXDAT && task_s2.bits.chiOpcode.get === DATOpcodes.CompData
} else false.B
val snpHitReleaseNeedData = if (enableCHI) {
!mshrTask_s2 && task_s2.bits.fromB && task_s2.bits.snpHitReleaseWithData
} else false.B
io.releaseBufRead_s2.valid := Mux(
mshrTask_s2,
releaseNeedData ||
snoopNeedData ||
dctNeedData ||
mshrTask_s2_a_upwards && task_s2.bits.useProbeData,
snpHitReleaseNeedData
)
io.releaseBufRead_s2.bits.id := Mux(
task_s2.bits.snpHitRelease,
task_s2.bits.snpHitReleaseIdx,
task_s2.bits.mshrId
)

require(beatSize == 2)

/* status of each pipeline stage */
io.status_s1.sets := VecInit(Seq(C_task.set, B_task.set, io.ASet, mshr_task_s1.bits.set))
io.status_s1.tags := VecInit(Seq(C_task.tag, B_task.tag, io.ATag, mshr_task_s1.bits.tag))
// io.status_s1.isKeyword := VecInit(Seq(C_task.isKeyword, B_task.isKeyword, io.isKeyword, mshr_task_s1.bits.isKeyword))

require(io.status_vec.size == 2)
io.status_vec.zip(Seq(task_s1, task_s2)).foreach {
case (status, task) =>
status.valid := task.valid
status.bits.channel := task.bits.channel
}

if (enableCHI) {
require(io.status_vec_toTX.get.size == 2)
io.status_vec_toTX.get.zip(Seq(task_s1, task_s2)).foreach {
case (status, task) =>
status.valid := task.valid
status.bits.channel := task.bits.channel
status.bits.txChannel := task.bits.txChannel
status.bits.mshrTask := task.bits.mshrTask
}
}

dontTouch(io)

// Performance counters
XSPerfAccumulate(cacheParams, "mshr_req", mshr_task_s0.valid)
XSPerfAccumulate(cacheParams, "mshr_req", s0_fire)
XSPerfAccumulate(cacheParams, "mshr_req_stall", io.mshrTask.valid && !io.mshrTask.ready)

XSPerfAccumulate(cacheParams, "sinkA_req", io.sinkA.fire)
28 changes: 23 additions & 5 deletions src/main/scala/coupledL2/RequestBuffer.scala
Original file line number Diff line number Diff line change
@@ -1,10 +1,28 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2

import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink.TLMessages._
import freechips.rocketchip.tilelink.TLPermissions._
import chisel3._
import chisel3.util._
import coupledL2._
import coupledL2.utils._
import utility._

@@ -141,12 +159,12 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
e.valid && sameAddr(in, e.task)
)
).asUInt
val dup = io.in.valid && isPrefetch && dupMask.orR
val dup = isPrefetch && dupMask.orR

//!! TODO: we can also remove those that duplicate with mainPipe

/* ======== Alloc ======== */
io.in.ready := !full || doFlow || mergeA
io.in.ready := !full || doFlow || mergeA || dup

val insertIdx = PriorityEncoder(buffer.map(!_.valid))
val alloc = !full && io.in.valid && !doFlow && !dup && !mergeA
@@ -208,7 +226,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
// so when waitMP(1) is 0 and waitMP(0) is 1, desired cycleCnt reached
// we recalculate waitMS and occWays, overriding old mask
// to take new allocated MSHR into account
e.waitMP := e.waitMP >> 1.U
e.waitMP := e.waitMP >> 1
when(e.waitMP(1) === 0.U && e.waitMP(0) === 1.U) {
waitMSUpdate := conflictMask(e.task)
}
@@ -223,7 +241,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
val s1B_Block = io.s1Entrance.valid && io.s1Entrance.bits.set === e.task.set
val s1_Block = s1A_Block || s1B_Block
when(s1_Block) {
e.waitMP := e.waitMP | "b0100".U // fired-req at s2 next cycle
e.waitMP := (e.waitMP >> 1) | "b0100".U // fired-req at s2 next cycle
}

// update info
@@ -252,7 +270,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete

// add XSPerf to see how many cycles the req is held in Buffer
if(cacheParams.enablePerf) {
XSPerfAccumulate(cacheParams, "drop_prefetch", dup)
XSPerfAccumulate(cacheParams, "drop_prefetch", io.in.valid && dup)
if(flow){
XSPerfAccumulate(cacheParams, "req_buffer_flow", io.in.valid && doFlow)
}
5 changes: 4 additions & 1 deletion src/main/scala/coupledL2/SinkA.scala
Original file line number Diff line number Diff line change
@@ -38,7 +38,9 @@ class SinkA(implicit p: Parameters) extends L2Module {

def fromTLAtoTaskBundle(a: TLBundleA): TaskBundle = {
val task = Wire(new TaskBundle)
task := 0.U.asTypeOf(new TaskBundle)
task.channel := "b001".U
task.txChannel := 0.U
task.tag := parseAddress(a.address)._1
task.set := parseAddress(a.address)._2
task.off := parseAddress(a.address)._3
@@ -67,14 +69,15 @@ class SinkA(implicit p: Parameters) extends L2Module {
task.replTask := false.B
task.vaddr.foreach(_ := a.user.lift(VaddrKey).getOrElse(0.U))
//miss acquire keyword
task.isKeyword.foreach(_ := a.echo.lift(IsKeywordKey).getOrElse(false.B))
task.isKeyword.foreach(_ := a.echo.lift(IsKeywordKey).getOrElse(false.B))
task.mergeA := false.B
task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle)
task
}
def fromPrefetchReqtoTaskBundle(req: PrefetchReq): TaskBundle = {
val task = Wire(new TaskBundle)
val fullAddr = Cat(req.tag, req.set, 0.U(offsetBits.W))
task := 0.U.asTypeOf(new TaskBundle)
task.channel := "b001".U
task.tag := parseAddress(fullAddr)._1
task.set := parseAddress(fullAddr)._2
5 changes: 5 additions & 0 deletions src/main/scala/coupledL2/SinkC.scala
Original file line number Diff line number Diff line change
@@ -67,7 +67,9 @@ class SinkC(implicit p: Parameters) extends L2Module {

def toTaskBundle(c: TLBundleC): TaskBundle = {
val task = Wire(new TaskBundle)
task := 0.U.asTypeOf(new TaskBundle)
task.channel := "b100".U
task.txChannel := 0.U
task.tag := parseAddress(c.address)._1
task.set := parseAddress(c.address)._2
task.off := parseAddress(c.address)._3
@@ -145,6 +147,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
io.resp.mshrId := 0.U // DontCare
io.resp.tag := parseAddress(io.c.bits.address)._1
io.resp.set := parseAddress(io.c.bits.address)._2
io.resp.respInfo := 0.U.asTypeOf(io.resp.respInfo.cloneType)
io.resp.respInfo.opcode := io.c.bits.opcode
io.resp.respInfo.param := io.c.bits.param
io.resp.respInfo.last := last
@@ -158,6 +161,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
io.releaseBufWrite.valid := io.c.valid && io.c.bits.opcode === ProbeAckData && last
io.releaseBufWrite.bits.id := 0.U(mshrBits.W) // id is given by MSHRCtl by comparing address to the MSHRs
io.releaseBufWrite.bits.data.data := Cat(io.c.bits.data, probeAckDataBuf)
io.releaseBufWrite.bits.beatMask := Fill(beatSize, true.B)

// C-Release, with new data, comes before repl-Release writes old refill data back to DS
val newdataMask = VecInit(io.msInfo.map(s =>
@@ -174,6 +178,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
io.refillBufWrite.valid := RegNext(io.task.fire && io.task.bits.opcode === ReleaseData && newdataMask.orR, false.B)
io.refillBufWrite.bits.id := RegNext(OHToUInt(newdataMask))
io.refillBufWrite.bits.data.data := dataBuf(RegNext(io.task.bits.bufIdx)).asUInt
io.refillBufWrite.bits.beatMask := Fill(beatSize, true.B)

io.c.ready := !isRelease || !first || !full

4 changes: 3 additions & 1 deletion src/main/scala/coupledL2/TopDownMonitor.scala
Original file line number Diff line number Diff line change
@@ -21,8 +21,10 @@ import chisel3._
import chisel3.util._
import coupledL2.prefetch.PfSource
import coupledL2.utils._
import coupledL2.tl2tl.MSHRStatus
import utility.MemReqSource

// TODO: Accommodate CHI
class TopDownMonitor()(implicit p: Parameters) extends L2Module {
val banks = 1 << bankBits
val io = IO(new Bundle() {
@@ -53,7 +55,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
}

io.debugTopDown.l2MissMatch := Cat(addrMatchVec.flatten).orR
XSPerfAccumulate(cacheParams, s"${cacheParams.name}MissMatch_${cacheParams.hartId}", io.debugTopDown.l2MissMatch)
XSPerfAccumulate(cacheParams, s"${cacheParams.name}MissMatch", io.debugTopDown.l2MissMatch)

/* ====== PART TWO ======
* Count the parallel misses, and divide them into CPU/Prefetch
4 changes: 2 additions & 2 deletions src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala
Original file line number Diff line number Diff line change
@@ -27,11 +27,11 @@ import utility.{MemReqSource, Pipeline}
// TODO: PrefetchReceiver is temporarily used since L1&L2 do not support Hint.
// TODO: Delete this after Hint is accomplished.

case class PrefetchReceiverParams(n: Int = 32) extends PrefetchParameters {
case class PrefetchReceiverParams(n: Int = 32, tp: Boolean = true) extends PrefetchParameters {
override val hasPrefetchBit: Boolean = true
override val hasPrefetchSrc: Boolean = true
override val inflightEntries: Int = n
val hasTPPrefetcher: Boolean = true
val hasTPPrefetcher: Boolean = tp
}

class PrefetchReceiver()(implicit p: Parameters) extends PrefetchModule {
92 changes: 24 additions & 68 deletions src/main/scala/coupledL2/prefetch/Prefetcher.scala
Original file line number Diff line number Diff line change
@@ -291,9 +291,15 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule {
)
)))
})))
val tp = Module(new TemporalPrefetch()(p.alterPartial({
case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters()))
})))
val tp = prefetchOpt match {
case Some(param: PrefetchReceiverParams) =>
if (param.hasTPPrefetcher) {
Some(Module(new TemporalPrefetch()(p.alterPartial({
case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters()))
}))))
} else None
case _ => None
}
val pftQueue = Module(new PrefetchQueue)
val pipe = Module(new Pipeline(io.req.bits.cloneType, 1))
val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B))
@@ -325,93 +331,43 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule {
pbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U)
pbop.io.resp <> io.resp
pbop.io.resp.valid := io.resp.valid && io.resp.bits.isPBOP
tp.io.train <> io.train
tp.io.resp <> io.resp
tp.io.hartid := hartId
tp.foreach(_.io.train <> io.train)
tp.foreach(_.io.resp <> io.resp)
tp.foreach(_.io.hartid := hartId)

pfRcv.io.req.ready := true.B
vbop.io.req.ready := true.B
pbop.io.req.ready := true.B
tp.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid
tp.foreach(_.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid)
pipe.io.in <> pftQueue.io.deq
io.req <> pipe.io.out

// tpmeta interface
tp.io.tpmeta_port <> tpio.tpmeta_port.get
tp.foreach(_.io.tpmeta_port <> tpio.tpmeta_port.get)

/* pri vbop */
pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid))
pftQueue.io.enq.valid := pfRcv.io.req.valid ||
(l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || (if (tp.isDefined) tp.get.io.req.valid else false.B)))
pftQueue.io.enq.bits := ParallelPriorityMux(Seq(
pfRcv.io.req.valid -> pfRcv.io.req.bits,
vbop.io.req.valid -> vbop.io.req.bits,
pbop.io.req.valid -> pbop.io.req.bits,
tp.io.req.valid -> tp.io.req.bits
if (tp.isDefined) { tp.get.io.req.valid -> tp.get.io.req.bits }
else { false.B -> DontCare }
))
XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid)
if (tp.isDefined)
XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.get.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && pbop.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid)
if (tp.isDefined)
XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.get.io.req.valid)
XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped",
pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid))

/* pri pbop */
// pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid))
// pftQueue.io.enq.bits := ParallelPriorityMux(Seq(
// pfRcv.io.req.valid -> pfRcv.io.req.bits,
// pbop.io.req.valid -> pbop.io.req.bits,
// vbop.io.req.valid -> vbop.io.req.bits,
// tp.io.req.valid -> tp.io.req.bits
// ))
// XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && vbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped",
// pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid))

/* solo vbop */
// vbop.io.pbopCrossPage := true.B
// pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || tp.io.req.valid))
// pftQueue.io.enq.bits := ParallelPriorityMux(Seq(
// pfRcv.io.req.valid -> pfRcv.io.req.bits,
// vbop.io.req.valid -> vbop.io.req.bits,
// tp.io.req.valid -> tp.io.req.bits
// ))
// XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped",
// pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid))

/* solo pbop */
// vbop.io.train.valid := false.B
// vbop.io.resp.valid := false.B
// pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (pbop.io.req.valid || tp.io.req.valid))
// pftQueue.io.enq.bits := ParallelPriorityMux(Seq(
// pfRcv.io.req.valid -> pfRcv.io.req.bits,
// pbop.io.req.valid -> pbop.io.req.bits,
// tp.io.req.valid -> tp.io.req.bits
// ))
// XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && tp.io.req.valid)
// XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped",
// pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid))
pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || (if (tp.isDefined) tp.get.io.req.valid else false.B)))

case _ => assert(cond = false, "Unknown prefetcher")
}
}
}
74 changes: 74 additions & 0 deletions src/main/scala/coupledL2/tl2chi/Bundle.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink.TLPermissions._
import utility.MemReqSource
import coupledL2.{HasTLChannelBits, DirResult, PipeStatus}

// One-hot encodings selecting which CHI TX channel a task is routed to
// (consumed via the txChannel field of HasCHIChannelBits).
object CHIChannel {
def TXREQ = "b001".U // bit 0: request channel
def TXRSP = "b010".U // bit 1: response channel
def TXDAT = "b100".U // bit 2: data channel
}

// Mixin adding a one-hot CHI TX-channel selector to a Bundle,
// plus per-channel convenience predicates. Encoding matches CHIChannel.
trait HasCHIChannelBits { this: Bundle =>
val txChannel = UInt(3.W) // one-hot: see CHIChannel.TXREQ/TXRSP/TXDAT
def toTXREQ = txChannel(0).asBool // routed to TXREQ
def toTXRSP = txChannel(1).asBool // routed to TXRSP
def toTXDAT = txChannel(2).asBool // routed to TXDAT
}

// Pipeline-stage status extended with the CHI TX-channel bits;
// mshrTask marks whether the in-flight task originated from an MSHR.
class PipeStatusWithCHI(implicit p: Parameters) extends PipeStatus
with HasCHIChannelBits {
val mshrTask = Bool()
}

// Per-MSHR status exposed to the rest of the slice (e.g. SinkC matching,
// TopDown monitoring). Field order is part of the hardware layout — do not
// reorder. Mixes in both TileLink-channel and CHI-channel bits.
class MSHRStatus(implicit p: Parameters) extends TL2CHIL2Bundle
with HasTLChannelBits
with HasCHIChannelBits {
// TODO
val set = UInt(setBits.W) // cache set of the tracked request
val reqTag = UInt(tagBits.W) // tag of the incoming request
val metaTag = UInt(tagBits.W) // tag of the victim line when replacing
val needsRepl = Bool() // true when a replacement (eviction) is pending
val w_c_resp = Bool() // waiting for a TileLink C response (ProbeAck/Release)
val w_d_resp = Bool() // waiting for a downstream (D-side) response
val will_free = Bool() // MSHR is about to be deallocated

// val way = UInt(wayBits.W)
// val off = UInt(offsetBits.W)
// val opcode = UInt(3.W)
// val param = UInt(3.W)
// val size = UInt(msgSizeBits.W)
// val source = UInt(sourceIdBits.W)
// val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W))
// val aliasTask = aliasBitsOpt.map(_ => Bool())
// val needProbeAckData = Bool() // only for B reqs
// val fromL2pft = prefetchOpt.map(_ => Bool())
// val needHint = prefetchOpt.map(_ => Bool())

// for TopDown usage
val reqSource = UInt(MemReqSource.reqSourceBits.W) // originator of the request (core/prefetch/...)
val is_miss = Bool() // request missed in this cache
val is_prefetch = Bool() // request was a prefetch

}
327 changes: 327 additions & 0 deletions src/main/scala/coupledL2/tl2chi/MMIOBridge.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@

/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.tilelink.TLMessages._
import coupledL2.HasCoupledL2Parameters
import coupledL2.tl2chi.CHIOpcode._

// Diplomatic wrapper bridging TileLink MMIO (uncached Get/Put) accesses to
// CHI non-snoopable transactions. Declares the TL manager node covering the
// peripheral address space; the actual logic lives in MMIOBridgeImp.
class MMIOBridge()(implicit p: Parameters) extends LazyModule
with HasCoupledL2Parameters
with HasCHIMsgParameters {

// keep this module as a separate Verilog module in the emitted hierarchy
override def shouldBeInlined: Boolean = false

/**
* MMIO node
*/
// on-chip peripheral window carved out of the generic peripheral range below
val onChipPeripheralRange = AddressSet(0x38000000L, 0x07ffffffL)
// UART gets its own TL slave parameters so it can carry device-tree resources
val uartRange = AddressSet(0x40600000, 0xf)
val uartDevice = new SimpleDevice("serial", Seq("xilinx,uartlite"))
val uartParams = TLSlaveParameters.v1(
address = Seq(uartRange),
resources = uartDevice.reg,
regionType = RegionType.UNCACHED,
supportsGet = TransferSizes(1, 8),
supportsPutFull = TransferSizes(1, 8),
supportsPutPartial = TransferSizes(1, 8)
)
// everything below 2 GiB except the on-chip peripheral window and the UART
val peripheralRange = AddressSet(
0x0, 0x7fffffff
).subtract(onChipPeripheralRange).flatMap(x => x.subtract(uartRange))

// single TL manager port handling all uncached peripheral accesses, 8B beats
val mmioNode = TLManagerNode(Seq(TLSlavePortParameters.v1(
managers = Seq(TLSlaveParameters.v1(
address = peripheralRange,
regionType = RegionType.UNCACHED,
supportsGet = TransferSizes(1, 8),
supportsPutFull = TransferSizes(1, 8),
supportsPutPartial = TransferSizes(1, 8)
), uartParams),
beatBytes = 8
)))

lazy val module = new MMIOBridgeImp(this)

}

// One in-flight MMIO transaction tracker. Converts a single TL-A Get/Put into
// a CHI ReadNoSnp / WriteNoSnpFull / WriteNoSnpPtl transaction and produces
// the TL-D response. State is tracked MSHR-style with s_* (schedule: something
// still to send, active-low) and w_* (wait: response still outstanding,
// active-low) flags. Also handles CHI protocol retry (RetryAck + PCrdGrant).
class MMIOBridgeEntry(edge: TLEdgeIn)(implicit p: Parameters) extends TL2CHIL2Module {

// ReadNoSnp here requires Endpoint Order (needRR), so a ReadReceipt gates
// issuing the next ordered read (see waitOnReadReceipt).
val needRR = true
val order = WireInit(if (needRR) OrderEncodings.EndpointOrder else OrderEncodings.None)

val io = IO(new Bundle() {
val req = Flipped(DecoupledIO(new TLBundleA(edge.bundle))) // incoming TL-A request
val resp = DecoupledIO(new TLBundleD(edge.bundle)) // outgoing TL-D response
val chi = new DecoupledNoSnpPortIO // CHI non-snoopable port (tx req/dat, rx rsp/dat)
val id = Input(UInt()) // this entry's index, used as CHI TxnID
val pCrdQuery = Output(ValidIO(new Bundle() { // asks the top level for a matching P-Credit
val pCrdType = UInt(PCRDTYPE_WIDTH.W)
}))
val pCrdGrant = Input(Bool()) // a matching P-Credit was granted
val waitOnReadReceipt = Option.when(needRR)(Output(Bool()))
})

// schedule flags: false = still needs to be sent
val s_txreq = RegInit(true.B)
val s_ncbwrdata = RegInit(true.B)
// val s_readrecript = RegInit(true.B) // TODO
// val s_compack = RegInit(true.B) // TODO
val s_resp = RegInit(true.B)
// wait flags: false = response still outstanding
val w_comp = RegInit(true.B)
val w_dbidresp = RegInit(true.B)
val w_compdata = RegInit(true.B)
val w_pcrdgrant = RegInit(true.B)
val w_readreceipt = Option.when(needRR)(RegInit(true.B))

val no_schedule = s_txreq && s_ncbwrdata && s_resp
val no_wait = w_comp && w_dbidresp && w_compdata && w_pcrdgrant && w_readreceipt.getOrElse(true.B)

// latched request; entry is busy while anything is scheduled or awaited
val req = RegEnable(io.req.bits, io.req.fire)
val req_valid = !no_schedule || !no_wait
val rdata = Reg(UInt(DATA_WIDTH.W)) // CompData payload buffered for TL-D
val srcID = Reg(UInt(SRCID_WIDTH.W)) // SrcID of the (Comp)DBIDResp, target of NCBWrData
val dbID = Reg(UInt(DBID_WIDTH.W)) // DBID of the (Comp)DBIDResp, TxnID of NCBWrData
val allowRetry = RegInit(true.B) // cleared after the first RetryAck (retry only once)
val pCrdType = Reg(UInt(PCRDTYPE_WIDTH.W)) // credit type announced by RetryAck
val isRead = req.opcode === Get

// TL data width is one word (64b); CHI data width is `words` of them.
val wordBits = io.req.bits.data.getWidth // 64
val wordBytes = wordBits / 8
val words = DATA_WIDTH / wordBits
val wordIdxBits = log2Ceil(words)
require(wordBits == 64)
require(wordIdxBits == 2)
// which 64-bit word of the CHI beat this (<=8B) access targets
val reqWordIdx = (req.address >> log2Ceil(wordBytes))(wordIdxBits - 1, 0)

val txreq = io.chi.tx.req
val txdat = io.chi.tx.dat
val rxdat = io.chi.rx.dat
val rxrsp = io.chi.rx.rsp

/**
* Entry allocation
*/
// On accept: arm the TXREQ and TL-D response, plus the per-opcode waits
// (CompData+ReadReceipt for reads; Comp+DBIDResp and NCBWrData for writes).
when (io.req.fire) {
s_txreq := false.B
s_resp := false.B
allowRetry := true.B
when (io.req.bits.opcode === Get) {
w_compdata := false.B
w_readreceipt.foreach(_ := false.B)
}.elsewhen (io.req.bits.opcode === PutFullData || io.req.bits.opcode === PutPartialData) {
w_comp := false.B
w_dbidresp := false.B
s_ncbwrdata := false.B
}
}

/**
* State flags recover
*/
when (txreq.fire) {
s_txreq := true.B
}
when (rxdat.fire) {
w_compdata := true.B
rdata := rxdat.bits.data
}
when (io.resp.fire) {
s_resp := true.B
}
when (rxrsp.fire) {
// CompDBIDResp combines Comp and DBIDResp, hence the overlapping cases
when (rxrsp.bits.opcode === RSPOpcodes.CompDBIDResp || rxrsp.bits.opcode === RSPOpcodes.Comp) {
w_comp := true.B
}
when (rxrsp.bits.opcode === RSPOpcodes.CompDBIDResp || rxrsp.bits.opcode === RSPOpcodes.DBIDResp) {
w_dbidresp := true.B
srcID := rxrsp.bits.srcID
dbID := rxrsp.bits.dbID
}
// RetryAck: re-arm the TXREQ but hold it until a matching P-Credit arrives
when (rxrsp.bits.opcode === RSPOpcodes.RetryAck) {
s_txreq := false.B
w_pcrdgrant := false.B
allowRetry := false.B
pCrdType := rxrsp.bits.pCrdType
}
when (rxrsp.bits.opcode === RSPOpcodes.ReadReceipt) {
w_readreceipt.foreach(_ := true.B)
}
}
when (txdat.fire) {
s_ncbwrdata := true.B
}
when (io.pCrdGrant) {
w_pcrdgrant := true.B
}

/**
* IO Assignment
*/
io.req.ready := no_schedule && no_wait
// TXREQ is held back after RetryAck until the P-Credit is granted
txreq.valid := !s_txreq && w_pcrdgrant
txreq.bits := 0.U.asTypeOf(txreq.bits.cloneType)
// NOTE(review): tgtID is looked up from txreq.bits.addr, which is itself
// driven below (last-connect) — intentional wire self-reference, not a bug.
txreq.bits.tgtID := SAM(sam).lookup(txreq.bits.addr)
txreq.bits.txnID := io.id
txreq.bits.opcode := ParallelLookUp(req.opcode, Seq(
Get -> REQOpcodes.ReadNoSnp,
PutFullData -> REQOpcodes.WriteNoSnpFull,
PutPartialData -> REQOpcodes.WriteNoSnpPtl
))
txreq.bits.size := req.size
txreq.bits.addr := req.address
txreq.bits.allowRetry := allowRetry
txreq.bits.order := order
// a retried request must carry the credit type from RetryAck
txreq.bits.pCrdType := Mux(allowRetry, 0.U, pCrdType)
txreq.bits.memAttr := MemAttr(allocate = false.B, cacheable = false.B, device = true.B, ewa = false.B)
txreq.bits.expCompAck := false.B

// TL-D: reads complete on CompData; writes on Comp+DBIDResp after NCBWrData sent
io.resp.valid := !s_resp && Mux(isRead, w_compdata, w_comp && w_dbidresp && s_ncbwrdata)
io.resp.bits.opcode := Mux(isRead, AccessAckData, AccessAck)
io.resp.bits.param := 0.U // reserved
io.resp.bits.size := req.size
io.resp.bits.source := req.source
io.resp.bits.sink := 0.U // ignored
io.resp.bits.denied := false.B
io.resp.bits.corrupt := false.B
// select the addressed 64-bit word out of the buffered CHI data beat
io.resp.bits.data := ParallelLookUp(
reqWordIdx,
List.tabulate(words)(i => i.U -> rdata((i + 1) * wordBits - 1, i * wordBits))
)

// write data (NonCopyBackWrData) goes out once DBID is known
txdat.valid := !s_ncbwrdata && w_dbidresp
txdat.bits := 0.U.asTypeOf(txdat.bits.cloneType)
txdat.bits.tgtID := srcID
txdat.bits.txnID := dbID
txdat.bits.opcode := DATOpcodes.NonCopyBackWrData
txdat.bits.ccID := Cat(req.address(log2Ceil(beatBytes)), 0.U(1.W))
txdat.bits.dataID := Cat(req.address(log2Ceil(beatBytes)), 0.U(1.W))
// place the TL byte mask at the addressed word within the CHI BE vector
txdat.bits.be := ParallelLookUp(
reqWordIdx,
List.tabulate(words)(i => i.U -> (ZeroExt(req.mask, BE_WIDTH) << (i * wordBytes)))
)
// replicate the 64b TL data across the beat, masked down to enabled bytes
txdat.bits.data := Fill(words, req.data) & FillInterleaved(8, txdat.bits.be)

// only accept responses after our request is out (s_txreq) and while waiting
rxrsp.ready := (!w_comp || !w_dbidresp || !w_readreceipt.getOrElse(true.B)) && s_txreq
rxdat.ready := !w_compdata && s_txreq

io.pCrdQuery.valid := !w_pcrdgrant
io.pCrdQuery.bits.pCrdType := pCrdType

// ordered read still awaiting its ReadReceipt (blocks later ReadNoSnp issue)
io.waitOnReadReceipt.foreach(_ := !w_readreceipt.get && (s_txreq || !allowRetry))
}

// Module implementation of MMIOBridge: a pool of MMIOBridgeEntry trackers,
// arbitration of their CHI TX channels, demux of RX responses by TxnID,
// and bookkeeping of P-Credits that may arrive before their RetryAck.
class MMIOBridgeImp(outer: MMIOBridge) extends LazyModuleImp(outer)
with HasCoupledL2Parameters
with HasCHIMsgParameters {

val (bus, edge) = outer.mmioNode.in.head

val io = IO(new DecoupledNoSnpPortIO)

// one entry per outstanding MMIO transaction; entry index doubles as TxnID
val entries = Seq.fill(mmioBridgeSize) { Module(new MMIOBridgeEntry(edge)) }
val readys = VecInit(entries.map(_.io.req.ready))
// allocate the lowest-indexed free entry
val selectOH = ParallelPriorityMux(readys.zipWithIndex.map { case (ready, i) =>
ready -> (1 << i).U
}).asBools

/**
* When a ReadNoSnp requires RequestOrder or Endpoint Order, the requester requires a ReadReceipt to determine
* when it can send the next ordered request.
*/
val waitOnReadReceiptVec = entries.map(e => e.io.waitOnReadReceipt.getOrElse(false.B))
val waitOnReadReceipt = Cat(waitOnReadReceiptVec).orR

/**
* Protocol Retry
*/
// CAM of P-Credits that arrived and are not yet consumed by a retrying entry
val pCrdValids = RegInit(VecInit(Seq.fill(mmioBridgeSize)(false.B)))
val pCrdTypes = Reg(Vec(mmioBridgeSize, UInt(PCRDTYPE_WIDTH.W)))
val pCrdInsertOH = PriorityEncoderOH(pCrdValids.map(!_))
val isPCrdGrant = io.rx.rsp.bits.opcode === RSPOpcodes.PCrdGrant
// pCrdMatch(entry)(slot): entry's pending query matches the stored credit
val pCrdMatch = Wire(Vec(mmioBridgeSize, Vec(mmioBridgeSize, Bool())))
val pCrdMatchEntryVec = pCrdMatch.map(_.asUInt.orR)
val pCrdMatchEntryOH = PriorityEncoderOH(pCrdMatchEntryVec)
// the CAM slot consumed by the winning entry
val pCrdFreeOH = ParallelPriorityMux(
pCrdMatchEntryVec,
pCrdMatch.map(x => VecInit(PriorityEncoderOH(x)))
)

// store an incoming PCrdGrant into a free CAM slot
when (io.rx.rsp.valid && isPCrdGrant) {
pCrdValids.zip(pCrdInsertOH).foreach { case (v, insert) =>
when (insert) { v := true.B }
assert(!(v && insert), "P-Credit overflow")
}
pCrdTypes.zip(pCrdInsertOH).foreach { case (t, insert) =>
when (insert) { t := io.rx.rsp.bits.pCrdType }
}
}
// release the slot once a matching entry takes the credit
pCrdFreeOH.zip(pCrdValids).foreach { case (free, v) =>
when (free) { v := false.B }
}

entries.zipWithIndex.foreach { case (entry, i) =>
entry.io.req.valid := bus.a.valid && selectOH(i)
entry.io.req.bits := bus.a.bits

// RX demux: responses are steered to the entry whose index equals TxnID
entry.io.chi.rx.dat.valid := io.rx.dat.valid && io.rx.dat.bits.txnID === i.U
entry.io.chi.rx.dat.bits := io.rx.dat.bits

entry.io.chi.rx.rsp.valid := io.rx.rsp.valid && io.rx.rsp.bits.txnID === i.U
entry.io.chi.rx.rsp.bits := io.rx.rsp.bits

entry.io.id := i.U

pCrdMatch(i) := VecInit(pCrdValids.zip(pCrdTypes).map { case (v, t) =>
entry.io.pCrdQuery.valid && v &&
entry.io.pCrdQuery.bits.pCrdType === t
})
entry.io.pCrdGrant := pCrdMatchEntryOH(i)
}

// TXREQ arbitration; ordered reads are blocked while any ReadReceipt is pending
val txreqArb = Module(new Arbiter(chiselTypeOf(io.tx.req.bits), mmioBridgeSize))
for ((a, req) <- txreqArb.io.in.zip(entries.map(_.io.chi.tx.req))) {
a <> req
val isReadNoSnp = req.bits.opcode === REQOpcodes.ReadNoSnp
val block = isReadNoSnp && waitOnReadReceipt
req.ready := a.ready && !block
a.valid := req.valid && !block
}
io.tx.req <> txreqArb.io.out
// arb(entries.map(_.io.chi.tx.req), io.tx.req, Some("mmio_txreq"))
arb(entries.map(_.io.chi.tx.dat), io.tx.dat, Some("mmio_txdat"))
arb(entries.map(_.io.resp), bus.d, Some("mmio_channel_D"))

bus.a.ready := Cat(readys).orR

// RX ready: the addressed entry's ready; PCrdGrant is always accepted (CAM)
io.rx.dat.ready := Cat(entries.zipWithIndex.map { case (entry, i) =>
entry.io.chi.rx.dat.ready && io.rx.dat.bits.txnID === i.U
}).orR
io.rx.rsp.ready := Cat(entries.zipWithIndex.map { case (entry, i) =>
entry.io.chi.rx.rsp.ready && io.rx.rsp.bits.txnID === i.U
}).orR || isPCrdGrant

dontTouch(io)
dontTouch(bus)
}
1,037 changes: 1,037 additions & 0 deletions src/main/scala/coupledL2/tl2chi/MSHR.scala

Large diffs are not rendered by default.

308 changes: 308 additions & 0 deletions src/main/scala/coupledL2/tl2chi/MSHRCtl.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import chisel3.util.random.LFSR
import utility._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink._
import freechips.rocketchip.tilelink.TLMessages._
import coupledL2.prefetch.PrefetchTrain
import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram, XSPerfMax}
import coupledL2._
import tl2chi.{HasCHIMsgParameters}
import coupledL2.tl2chi.CHIOpcode.RSPOpcodes._

// PCrd info for MSHR Retry
class PCrdInfo(implicit p: Parameters) extends TL2CHIL2Bundle
{
val valid = Bool()
val srcID = chiOpt.map(_ => UInt(SRCID_WIDTH.W))
val pCrdType = chiOpt.map(_ => UInt(PCRDTYPE_WIDTH.W))
}

class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module {
val io = IO(new Bundle() {
/* interact with req arb */
val fromReqArb = Input(new Bundle() {
val status_s1 = new PipeEntranceStatus()
})
val toReqArb = Output(new BlockInfo())

/* interact with mainpipe */
val fromMainPipe = new Bundle() {
val mshr_alloc_s3 = Flipped(ValidIO(new MSHRRequest()))
}
val toMainPipe = new Bundle() {
val mshr_alloc_ptr = Output(UInt(mshrBits.W))
}

/* to request arbiter */
// val mshrFull = Output(Bool())
val mshrTask = DecoupledIO(new TaskBundle())

/* status of s2 and s3 */
val pipeStatusVec = Flipped(Vec(2, ValidIO(new PipeStatus)))

/* send reqs */
val toTXREQ = DecoupledIO(new CHIREQ())
val toTXRSP = DecoupledIO(new CHIRSP()) // TODO: unify with main pipe, which should be TaskBundle
val toSourceB = DecoupledIO(new TLBundleB(edgeIn.bundle))

/* to block sourceB from sending same-addr probe until GrantAck received */
val grantStatus = Input(Vec(grantBufInflightSize, new GrantStatus()))

/* receive resps */
val resps = Input(new Bundle() {
val sinkC = new RespBundle() //probeAck from core
val rxrsp = new RespBundle() //releaseAck(CompDBID) from CHI
val rxdat = new RespBundle() //AcquireBlock(CompData) from CHI
})

val releaseBufWriteId = Output(UInt(mshrBits.W))

/* nested writeback */
val nestedwb = Input(new NestedWriteback)
val nestedwbDataId = Output(ValidIO(UInt(mshrBits.W)))

/* MSHR info to Sinks */
val msInfo = Vec(mshrsAll, ValidIO(new MSHRInfo()))
val aMergeTask = Flipped(ValidIO(new AMergeTask))

/* refill read replacer result */
val replResp = Flipped(ValidIO(new ReplacerResult))

/* for TopDown Monitor */
val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus)))

/* to Slice Top for pCrd info.*/
val waitPCrdInfo = Output(Vec(mshrsAll, new PCrdInfo))
})

/*MSHR allocation pointer gen -> to Mainpipe*/
class MSHRSelector(implicit p: Parameters) extends L2Module {
val io = IO(new Bundle() {
val idle = Input(Vec(mshrsAll, Bool()))
val out = ValidIO(UInt(mshrsAll.W))
})
io.out.valid := ParallelOR(io.idle)
io.out.bits := ParallelPriorityMux(io.idle.zipWithIndex.map {
case (b, i) => (b, (1 << i).U)
})
}

val mshrs = Seq.fill(mshrsAll) { Module(new MSHR()) }
val mshrValids = VecInit(mshrs.map(m => m.io.status.valid))
val pipeReqCount = PopCount(Cat(io.pipeStatusVec.map(_.valid))) // TODO: consider add !mshrTask to optimize
val mshrCount = PopCount(Cat(mshrs.map(_.io.status.valid)))
val mshrFull = pipeReqCount + mshrCount >= mshrsAll.U
val a_mshrFull = pipeReqCount + mshrCount >= (mshrsAll-1).U // the last idle mshr should not be allocated for channel A req
val mshrSelector = Module(new MSHRSelector())
val selectedMSHROH = mshrSelector.io.out.bits

mshrSelector.io.idle := mshrs.map(m => !m.io.status.valid)
io.toMainPipe.mshr_alloc_ptr := OHToUInt(selectedMSHROH)

/*
when PCrdGrant, give credit to one entry that:
1. got RetryAck and not Reissued
2. match srcID and PCrdType
3. use random arbiter if multi-entry match
*/
val isPCrdGrant = io.resps.rxrsp.valid && (io.resps.rxrsp.respInfo.chiOpcode.get === PCrdGrant)
// Per-MSHR {valid, srcID, pCrdType} of entries waiting for a credit (driven below).
val waitPCrdInfo = Wire(Vec(mshrsAll, new PCrdInfo))
// val pArb = Module(new RRArbiter(UInt(), mshrsAll))

// Which waiting MSHRs match this PCrdGrant (same source node and credit type)?
val matchPCrdGrant = VecInit(waitPCrdInfo.map(p =>
isPCrdGrant && p.valid &&
p.srcID.get === io.resps.rxrsp.respInfo.srcID.get &&
p.pCrdType.get === io.resps.rxrsp.respInfo.pCrdType.get
))

/* pArb.io.in.zipWithIndex.foreach {
case (in, i) =>
in.valid := matchPCrdGrant(i)
in.bits := 0.U
}
pArb.io.out.ready := true.B
val pCrdRR = VecInit(UIntToOH(pArb.io.chosen))
val pCrdPri = VecInit((matchPCrdGrant.asUInt & pCrdRR.asUInt).asBools)
//val pCrdPri = VecInit(PriorityEncoderOH(matchPCrdGrant))
val pCrdIsWait = OHToUInt(pCrdPri)
*/

/*
Random arbiter if multi-entry match:
pick a random start index, then grant the first matching entry at or after it
(classic doubled-vector trick). NOTE(review): the doubleGnt slicing below
hard-codes widths 32/16, i.e. assumes mshrsAll == 16 — confirm.
*/
val lfsr = LFSR(16, true.B)
val idx = Random(16, lfsr)
val idxOH = VecInit(UIntToOH(idx))

val doubleReq = Fill(2, matchPCrdGrant.asUInt)
val doubleGnt = ~(doubleReq - idxOH.asUInt) & doubleReq
val gnt = doubleGnt(31,16) | doubleGnt(15,0)
val pCrdPri = VecInit(gnt.asBools)
// NOTE(review): pCrdIsWait is the *index* of the granted entry, so it is 0
// both when nothing matches and when entry 0 is granted; `!pCrdIsWait.orR`
// below cannot tell the two apart — a grant consumed by MSHR 0 would also be
// captured into the CAM. Confirm whether this double-accounting is intended.
val pCrdIsWait = OHToUInt(pCrdPri)

/* when PCrdGrant come before RetryAck, 16 entry CAM used to:
1. save {srcID, PCrdType}
2. Broadcast to each MSHR for searching when RetryAck
*/
// val pCamValids = RegInit(VecInit(Seq.fill(mshrsAll){ false.B }))
val pCam = RegInit(VecInit(Seq.fill(mshrsAll)(0.U.asTypeOf(new PCrdInfo))))
val pCamPri = Wire(UInt(5.W))
val pCamValids = Cat(pCam.map(_.valid))
val enqIdx = PriorityEncoder(~pCamValids.asUInt)

// A PCrdGrant that no MSHR is currently waiting for is parked in the CAM
// (at the lowest free slot) until the matching RetryAck arrives.
when (isPCrdGrant && !pCrdIsWait.orR){
pCam(enqIdx).valid := true.B
pCam(enqIdx).srcID.get := io.resps.rxrsp.respInfo.srcID.get
pCam(enqIdx).pCrdType.get := io.resps.rxrsp.respInfo.pCrdType.get
}

// Default: out-of-range sentinel; overwritten by the CAM search below on a hit.
pCamPri := 16.U //out of range of mshrAll
// Broadcast-match every MSHR's retry wait-info against the PCrdGrant CAM:
// if an MSHR waits on a {srcID, pCrdType} credit that already arrived (and
// was parked in pCam before the RetryAck), consume the CAM entry and route
// the credit to that MSHR via pCamPri.
// BUG FIX: the original compared waitPCrdInfo(i).srcID against
// pCam(j).pCrdType, so a parked credit could never (or wrongly) match;
// the credit-type fields must be compared with each other.
// NOTE(review): with Chisel last-connect semantics, multiple simultaneous
// matches would invalidate several CAM entries while pCamPri keeps only the
// highest i — confirm that at most one MSHR can wait per {srcID, pCrdType}.
for (i <- 0 until mshrsAll) { // each waiting MSHR entry
  when (waitPCrdInfo(i).valid) {
    for (j <- 0 until mshrsAll) { // each parked CAM entry
      when (pCam(j).valid &&
        waitPCrdInfo(i).srcID.get === pCam(j).srcID.get &&
        waitPCrdInfo(i).pCrdType.get === pCam(j).pCrdType.get) {
        pCam(j).valid := false.B
        pCamPri := i.U
      }
    }
  }
}

/* SinkC(release) search MSHR with PA */
// A ProbeAck/Release from L1 is matched against MSHRs waiting for a C-channel
// response; during replacement the block lives at metaTag rather than reqTag.
val resp_sinkC_match_vec = mshrs.map { mshr =>
val status = mshr.io.status.bits
val tag = Mux(status.needsRepl, status.metaTag, status.reqTag)
mshr.io.status.valid && status.w_c_resp && io.resps.sinkC.set === status.set && io.resps.sinkC.tag === tag
}

/* Port connection of MSHR entry */
mshrs.zipWithIndex.foreach {
case (m, i) =>
m.io.id := i.U
m.io.alloc.valid := selectedMSHROH(i) && io.fromMainPipe.mshr_alloc_s3.valid
m.io.alloc.bits := io.fromMainPipe.mshr_alloc_s3.bits
m.io.alloc.bits.task.isKeyword.foreach(_:= io.fromMainPipe.mshr_alloc_s3.bits.task.isKeyword.getOrElse(false.B))

m.io.resps.sinkC.valid := io.resps.sinkC.valid && resp_sinkC_match_vec(i)
m.io.resps.sinkC.bits := io.resps.sinkC.respInfo

// RXDAT responses are routed by the MSHR id carried in TxnID.
m.io.resps.rxdat.valid := m.io.status.valid && io.resps.rxdat.valid && io.resps.rxdat.mshrId === i.U
m.io.resps.rxdat.bits := io.resps.rxdat.respInfo

// RXRSP: normal responses route by MSHR id; a PCrdGrant instead goes to the
// (randomly arbitrated) matching retry-waiting entry via pCrdPri.
m.io.resps.rxrsp.valid := (m.io.status.valid && io.resps.rxrsp.valid && !isPCrdGrant && io.resps.rxrsp.mshrId === i.U) || (isPCrdGrant && pCrdPri(i))
m.io.resps.rxrsp.bits := io.resps.rxrsp.respInfo

m.io.replResp.valid := io.replResp.valid && io.replResp.bits.mshrId === i.U
m.io.replResp.bits := io.replResp.bits

io.msInfo(i) := m.io.msInfo
m.io.nestedwb := io.nestedwb
m.io.aMergeTask.valid := io.aMergeTask.valid && io.aMergeTask.bits.id === i.U
m.io.aMergeTask.bits := io.aMergeTask.bits.task

// Collect this MSHR's credit wait-info and tell it when a parked credit
// from the pCam search belongs to it.
waitPCrdInfo(i) := m.io.waitPCrdInfo
m.io.pCamPri := (pCamPri === i.U) && waitPCrdInfo(i).valid
}
/* Expose per-MSHR credit wait-info to the slice top */
io.waitPCrdInfo <> waitPCrdInfo

/* Reserve 1 entry for SinkB: B blocks when completely full, A one entry earlier */
io.toReqArb.blockC_s1 := false.B
io.toReqArb.blockB_s1 := mshrFull // conflict logic in SinkB
io.toReqArb.blockA_s1 := a_mshrFull // conflict logic in ReqBuf
io.toReqArb.blockG_s1 := false.B

/* Acquire downwards to TXREQ*/
fastArb(mshrs.map(_.io.tasks.txreq), io.toTXREQ, Some("txreq"))

/* Response downwards to TXRSP*/
fastArb(mshrs.map(_.io.tasks.txrsp), io.toTXRSP, Some("txrsp"))

/* Probe upwards */
val sourceB = Module(new SourceB())
fastArb(mshrs.map(_.io.tasks.source_b), sourceB.io.task, Some("source_b"))
sourceB.io.grantStatus := io.grantStatus
io.toSourceB <> sourceB.io.sourceB

/* Arbitrate MSHR task to RequestArbiter */
fastArb(mshrs.map(_.io.tasks.mainpipe), io.mshrTask, Some("mshr_task"))

/* releaseBuf link to MSHR id: index of the MSHR that matched the SinkC resp */
io.releaseBufWriteId := ParallelPriorityMux(resp_sinkC_match_vec, (0 until mshrsAll).map(i => i.U))

/* Nest writeback check: at most one MSHR may source nested-writeback data */
io.nestedwbDataId.valid := Cat(mshrs.map(_.io.nestedwbData)).orR
io.nestedwbDataId.bits := ParallelPriorityMux(mshrs.zipWithIndex.map {
case (mshr, i) => (mshr.io.nestedwbData, i.U)
})
assert(RegNext(PopCount(mshrs.map(_.io.nestedwbData)) <= 1.U), "should only be one nestedwbData")


/* Status for topDown monitor */
topDownOpt.foreach (_ =>
io.msStatus.get.zip(mshrs).foreach {
case (in, s) => in := s.io.status
}
)
/* Performance counters (disabled; kept for reference) */
/* XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkA", a_mshrFull)
XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkB", mshrFull)
XSPerfHistogram(cacheParams, "mshr_alloc", io.toMainPipe.mshr_alloc_ptr,
enable = io.fromMainPipe.mshr_alloc_s3.valid,
start = 0, stop = mshrsAll, step = 1)
if (cacheParams.enablePerf) {
val start = 0
val stop = 100
val step = 5
val acquire_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_d.valid -> m.acquire_period })
val release_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_d.valid -> m.release_period })
val probe_period = ParallelMux(mshrs.map { case m => m.io.resps.sink_c.valid -> m.probe_period })
val acquire_period_en = io.resps.rxdat.valid &&
(io.resps.rxdat.respInfo.opcode === Grant || io.resps.rxdat.respInfo.opcode === GrantData)
val release_period_en = io.resps.rxdat.valid && io.resps.rxdat.respInfo.opcode === ReleaseAck
val probe_period_en = io.resps.sinkC.valid &&
(io.resps.sinkC.respInfo.opcode === ProbeAck || io.resps.sinkC.respInfo.opcode === ProbeAckData)
XSPerfHistogram(cacheParams, "acquire_period", acquire_period, acquire_period_en, start, stop, step)
XSPerfHistogram(cacheParams, "release_period", release_period, release_period_en, start, stop, step)
XSPerfHistogram(cacheParams, "probe_period", probe_period, probe_period_en, start, stop, step)
val timers = RegInit(VecInit(Seq.fill(mshrsAll)(0.U(64.W))))
for (((timer, m), i) <- timers.zip(mshrs).zipWithIndex) {
when (m.io.alloc.valid) {
timer := 1.U
}.otherwise {
timer := timer + 1.U
}
val enable = m.io.status.valid && m.io.status.bits.will_free
XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10),
timer, enable, 0, 300, 10)
XSPerfMax(cacheParams, "mshr_latency", timer, enable)
}
}*/
}

895 changes: 895 additions & 0 deletions src/main/scala/coupledL2/tl2chi/MainPipe.scala

Large diffs are not rendered by default.

67 changes: 67 additions & 0 deletions src/main/scala/coupledL2/tl2chi/RXDAT.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import coupledL2.{RespBundle, MSHRBufWrite}

class RXDAT(implicit p: Parameters) extends TL2CHIL2Module {
  val io = IO(new Bundle() {
    val out = Flipped(DecoupledIO(new CHIDAT()))
    val in = Output(new RespBundle())
    val refillBufWrite = ValidIO(new MSHRBufWrite())
  })

  /* Receives inbound CHI DAT flits (e.g. CompData). Every beat is written
   * into the refill buffer, and first/last beats are reported to the MSHR
   * identified by TxnID. The channel is never back-pressured. */
  val flit = io.out.bits

  // With a 256-bit downstream bus a cache line arrives as two beats,
  // distinguished by DataID (Addr[5:4] of the lowest byte of the packet).
  // TODO: parameterize this
  val firstBeat = flit.dataID === "b00".U
  val lastBeat = flit.dataID === "b10".U

  /* Write refill buffer: replicate the beat across the line and let
   * beatMask select which slot is actually updated. */
  io.refillBufWrite.valid := io.out.valid
  io.refillBufWrite.bits.id := flit.txnID
  io.refillBufWrite.bits.data.data := Fill(beatSize, flit.data)
  io.refillBufWrite.bits.beatMask := Cat(lastBeat, firstBeat)

  /* Report beat arrival to the owning MSHR (TxnID carries the MSHR id). */
  io.in.valid := io.out.valid && (firstBeat || lastBeat)
  io.in.mshrId := flit.txnID
  io.in.set := 0.U(setBits.W)
  io.in.tag := 0.U(tagBits.W)

  val info = io.in.respInfo
  info.opcode := DontCare
  info.param := DontCare
  info.last := lastBeat
  info.dirty := DontCare
  info.isHit := DontCare
  info.chiOpcode.get := flit.opcode
  info.txnID.get := flit.txnID
  info.srcID.get := flit.srcID
  info.homeNID.get := flit.homeNID
  info.dbID.get := flit.dbID
  info.resp.get := flit.resp
  info.pCrdType.get := DontCare // RXDAT Channel does not have a pCrdType field

  io.out.ready := true.B

}
53 changes: 53 additions & 0 deletions src/main/scala/coupledL2/tl2chi/RXRSP.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import coupledL2.RespBundle

class RXRSP(implicit p: Parameters) extends TL2CHIL2Module {
  val io = IO(new Bundle() {
    val out = Flipped(DecoupledIO(new CHIRSP()))
    val in = Output(new RespBundle())
  })

  /* Receives inbound CHI RSP flits and forwards them to MSHRCtl:
   *   1. Comp
   *   2. CompDBIDResp
   *   3. RetryAck
   *   4. PCrdGrant
   * The channel is never back-pressured; TxnID carries the MSHR id. */
  val flit = io.out.bits

  io.in.valid := io.out.valid
  io.in.mshrId := flit.txnID
  io.in.set := 0.U(setBits.W)
  io.in.tag := 0.U(tagBits.W)

  // Zero the whole respInfo first, then fill in the fields RSP carries.
  val info = io.in.respInfo
  info := 0.U.asTypeOf(info.cloneType)
  info.chiOpcode.get := flit.opcode
  info.txnID.get := flit.txnID
  info.srcID.get := flit.srcID
  info.dbID.get := flit.dbID
  info.pCrdType.get := flit.pCrdType
  info.last := true.B

  io.out.ready := true.B

}
147 changes: 147 additions & 0 deletions src/main/scala/coupledL2/tl2chi/RXSNP.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import scala.collection.View.Fill
import coupledL2.{TaskBundle, MSHRInfo, MetaEntry, MergeTaskBundle}
import coupledL2.MetaData._

// Receives inbound CHI SNP flits, converts them into pipeline TaskBundles,
// and stalls them while a conflicting MSHR must keep priority (see below).
class RXSNP(
lCreditNum: Int = 4 // the number of L-Credits that a receiver can provide
)(implicit p: Parameters) extends TL2CHIL2Module {
val io = IO(new Bundle() {
val rxsnp = Flipped(DecoupledIO(new CHISNP()))
val task = DecoupledIO(new TaskBundle())
val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo())))
})

val task = Wire(new TaskBundle)

/**
* When should an MSHR with Acquire address of X block/nest an incoming snoop with address X?
*
* 1. Before MSHR receives the first beat of CompData, snoop should be **nested** because snoop has higher priority
* than request according to CHI spec.
* 2. After MSHR receives the first beat of CompData, and before L2 receives GrantAck from L1, snoop of X should be
* **blocked**, because a slave should not issue a Probe if there is a pending GrantAck on the block according
* to TileLink spec.
* 3. Before MSHR sends out WriteBackFull/Evict to write refilled data into DS, snoop should be **blocked**, Because
* the snooped block is still in RefillBuffer rather than DS.
* 4. After MSHR sends out WriteBackFull/Evict and write refilled data into DS, snoop should be **nested**, still
* because snoop has higher priority than request.
*/
val reqBlockSnpMask = VecInit(io.msInfo.map(s =>
s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag &&
(s.bits.w_grantfirst || s.bits.aliasTask.getOrElse(false.B) && !s.bits.w_rprobeacklast) &&
(s.bits.blockRefill || s.bits.w_releaseack) && !s.bits.willFree
)).asUInt
val reqBlockSnp = reqBlockSnpMask.orR

/**
* When should an MSHR that is going to replace cacheline Y block/nest an incoming snoop with address Y?
*
* 1. After MSHR decides which way to replace but before MSHR finished all the rProbes, the incoming snoop of Y
* should be **blocked**, because Once the Probe is issued the slave should not issue further Probes on the block
* until it receives a ProbeAck.
* 2. After MSHR receives all the ProbeAcks of rProbe, the snoop of Y should be nested.
*/
val replaceBlockSnpMask = VecInit(io.msInfo.map(s =>
s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && !s.bits.dirHit && isValid(s.bits.metaState) &&
s.bits.w_replResp && (!s.bits.w_rprobeacklast || s.bits.w_releaseack) && !s.bits.willFree
)).asUInt
val replaceBlockSnp = replaceBlockSnpMask.orR
// MSHRs whose replacement victim the snoop may nest (probes done, writeback pending).
val replaceNestSnpMask = VecInit(io.msInfo.map(s =>
s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && !s.bits.dirHit && s.bits.metaState =/= INVALID &&
s.bits.w_replResp && s.bits.w_rprobeacklast && !s.bits.w_releaseack
)).asUInt
// MSHRs whose writeback carries data (WriteBackFull rather than Evict).
val replaceDataMask = VecInit(io.msInfo.map(_.bits.replaceData)).asUInt

task := fromSnpToTaskBundle(io.rxsnp.bits)

// Stall the snoop while any MSHR must keep priority on the block.
val stall = reqBlockSnp || replaceBlockSnp // addrConflict || replaceConflict
io.task.valid := io.rxsnp.valid && !stall
io.task.bits := task
io.rxsnp.ready := io.task.ready && !stall

// Watchdog: count consecutive cycles the snoop is stalled to catch deadlock.
val stallCnt = RegInit(0.U(64.W))
when(io.rxsnp.fire) {
stallCnt := 0.U
}.elsewhen(io.rxsnp.valid && !io.rxsnp.ready) {
stallCnt := stallCnt + 1.U
}

val STALL_CNT_MAX = 28000.U
assert(stallCnt <= STALL_CNT_MAX, "stallCnt full! maybe there is a deadlock! addr => 0x%x req_opcode => %d txn_id => %d", io.rxsnp.bits.addr, io.rxsnp.bits.opcode, io.rxsnp.bits.txnID);

assert(!(stall && io.rxsnp.fire))

// Converts a CHI snoop flit into a channel-B pipeline task. Fields that have
// no meaning for a snoop are zeroed / defaulted.
def fromSnpToTaskBundle(snp: CHISNP): TaskBundle = {
val task = WireInit(0.U.asTypeOf(new TaskBundle))
task.channel := "b010".U
// Addr in CHI SNP channel has 3 fewer bits than full address
val snpFullAddr = Cat(snp.addr, 0.U(3.W))
task.tag := parseAddress(snpFullAddr)._1
task.set := parseAddress(snpFullAddr)._2
task.off := parseAddress(snpFullAddr)._3
task.alias.foreach(_ := 0.U)
task.vaddr.foreach(_ := 0.U)
task.isKeyword.foreach(_ := false.B)
// task.opcode := snp.opcode
task.param := 0.U
task.size := log2Up(cacheParams.blockBytes).U
task.sourceId := 0.U(sourceIdBits.W)
task.bufIdx := 0.U(bufIdxBits.W)
task.needProbeAckData := false.B
task.mshrTask := false.B
task.mshrId := 0.U(mshrBits.W)
task.aliasTask.foreach(_ := false.B)
task.useProbeData := false.B
task.mshrRetry := false.B
task.fromL2pft.foreach(_ := false.B)
task.needHint.foreach(_ := false.B)
task.dirty := false.B
task.way := 0.U(wayBits.W)
task.meta := 0.U.asTypeOf(new MetaEntry)
task.metaWen := false.B
task.tagWen := false.B
task.dsWen := false.B
task.wayMask := Fill(cacheParams.ways, "b1".U)
task.reqSource := MemReqSource.NoWhere.id.U
task.replTask := false.B
// Record whether this snoop nests an in-flight replacement writeback.
task.snpHitRelease := replaceNestSnpMask.orR
task.snpHitReleaseWithData := (replaceNestSnpMask & replaceDataMask).orR
task.snpHitReleaseIdx := PriorityEncoder(replaceNestSnpMask)
task.tgtID.foreach(_ := 0.U) // TODO
task.srcID.foreach(_ := snp.srcID)
task.txnID.foreach(_ := snp.txnID)
task.dbID.foreach(_ := 0.U)
task.fwdNID.foreach(_ := snp.fwdNID)
task.fwdTxnID.foreach(_ := snp.fwdTxnID)
task.chiOpcode.foreach(_ := snp.opcode)
task.pCrdType.foreach(_ := 0.U)
task.retToSrc.foreach(_ := snp.retToSrc)
task
}

}
207 changes: 207 additions & 0 deletions src/main/scala/coupledL2/tl2chi/Slice.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink._
import org.chipsalliance.cde.config.Parameters
import coupledL2._
import coupledL2.prefetch.PrefetchIO

// One L2 slice (bank): TileLink upwards to L1, CHI downwards to the
// interconnect. Instantiates and wires all datapath/control submodules.
// FIX: the original connected `mainPipe.io.bufResp := sinkC.io.bufResp`
// twice; the redundant second connection is removed (Chisel last-connect
// made it harmless but it was dead code).
class Slice()(implicit p: Parameters) extends TL2CHIL2Module {
  val io = IO(new Bundle() {
    val in = Flipped(TLBundle(edgeIn.bundle))
    val out = new DecoupledPortIO
    val sliceId = Input(UInt(bankBits.W))
    val l1Hint = Decoupled(new L2ToL1Hint())
    val waitPCrdInfo = Output(Vec(mshrsAll, new PCrdInfo))
    val prefetch = prefetchOpt.map(_ => Flipped(new PrefetchIO))
    val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus)))
    val dirResult = topDownOpt.map(_ => ValidIO(new DirResult))
    val latePF = topDownOpt.map(_ => Output(Bool()))
  })

  /* Upwards TileLink-related modules */
  val sinkA = Module(new SinkA)
  val sinkC = Module(new SinkC)
  val grantBuf = Module(new GrantBuffer)

  /* Downwards CHI-related modules */
  val txreq = Module(new TXREQ())
  val txdat = Module(new TXDAT())
  val txrsp = Module(new TXRSP())
  val rxsnp = Module(new RXSNP())
  val rxdat = Module(new RXDAT())
  val rxrsp = Module(new RXRSP())

  /* Data path and control path */
  val directory = Module(new Directory())
  val dataStorage = Module(new DataStorage())
  val refillBuf = Module(new MSHRBuffer(wPorts = 2))
  val releaseBuf = Module(new MSHRBuffer(wPorts = 3))

  val reqArb = Module(new RequestArb())
  val mainPipe = Module(new MainPipe())
  val reqBuf = Module(new RequestBuffer())
  val mshrCtl = Module(new MSHRCtl())

  sinkC.io.msInfo := mshrCtl.io.msInfo

  grantBuf.io.d_task <> mainPipe.io.toSourceD
  grantBuf.io.fromReqArb.status_s1 := reqArb.io.status_s1
  grantBuf.io.pipeStatusVec := reqArb.io.status_vec ++ mainPipe.io.status_vec_toD

  /* TX channels observe pipeline status for back pressure */
  val status_vec_toTX = reqArb.io.status_vec_toTX.get ++ mainPipe.io.status_vec_toTX
  txreq.io.pipeReq <> mainPipe.io.toTXREQ
  txreq.io.mshrReq <> mshrCtl.io.toTXREQ
  txreq.io.pipeStatusVec := status_vec_toTX
  txreq.io.sliceId := io.sliceId

  txdat.io.in <> mainPipe.io.toTXDAT
  txdat.io.pipeStatusVec := status_vec_toTX

  txrsp.io.pipeRsp <> mainPipe.io.toTXRSP
  txrsp.io.mshrRsp <> mshrCtl.io.toTXRSP
  txrsp.io.pipeStatusVec := status_vec_toTX

  rxsnp.io.msInfo := mshrCtl.io.msInfo

  directory.io.read <> reqArb.io.dirRead_s1
  directory.io.metaWReq := mainPipe.io.metaWReq
  directory.io.tagWReq := mainPipe.io.tagWReq
  directory.io.msInfo := mshrCtl.io.msInfo

  dataStorage.io.req := mainPipe.io.toDS.req_s3
  dataStorage.io.wdata := mainPipe.io.toDS.wdata_s3

  reqArb.io.ATag := reqBuf.io.ATag
  reqArb.io.ASet := reqBuf.io.ASet
  reqArb.io.sinkA <> reqBuf.io.out
  reqArb.io.sinkB <> rxsnp.io.task // CHI snoops enter the pipeline as channel-B tasks
  reqArb.io.sinkC <> sinkC.io.task
  reqArb.io.mshrTask <> mshrCtl.io.mshrTask
  reqArb.io.fromMSHRCtl := mshrCtl.io.toReqArb
  reqArb.io.fromMainPipe := mainPipe.io.toReqArb
  reqArb.io.fromGrantBuffer := grantBuf.io.toReqArb
  reqArb.io.fromTXDAT.foreach(_ := txdat.io.toReqArb)
  reqArb.io.fromTXRSP.foreach(_ := txrsp.io.toReqArb)
  reqArb.io.fromTXREQ.foreach(_ := txreq.io.toReqArb)
  reqArb.io.msInfo := mshrCtl.io.msInfo

  reqBuf.io.in <> sinkA.io.task
  reqBuf.io.mshrInfo := mshrCtl.io.msInfo
  reqBuf.io.mainPipeBlock := mainPipe.io.toReqBuf
  reqBuf.io.s1Entrance := reqArb.io.s1Entrance

  mainPipe.io.taskFromArb_s2 := reqArb.io.taskToPipe_s2
  mainPipe.io.taskInfo_s1 := reqArb.io.taskInfo_s1
  mainPipe.io.fromReqArb.status_s1 := reqArb.io.status_s1
  mainPipe.io.bufResp := sinkC.io.bufResp
  mainPipe.io.dirResp_s3 := directory.io.resp.bits
  mainPipe.io.replResp := directory.io.replResp
  mainPipe.io.fromMSHRCtl <> mshrCtl.io.toMainPipe
  mainPipe.io.refillBufResp_s3.valid := RegNext(refillBuf.io.r.valid, false.B)
  mainPipe.io.refillBufResp_s3.bits := refillBuf.io.resp.data
  mainPipe.io.releaseBufResp_s3.valid := RegNext(releaseBuf.io.r.valid, false.B)
  mainPipe.io.releaseBufResp_s3.bits := releaseBuf.io.resp.data
  mainPipe.io.toDS.rdata_s5 := dataStorage.io.rdata
  // mainPipe.io.grantBufferHint := grantBuf.io.l1Hint
  // mainPipe.io.globalCounter := grantBuf.io.globalCounter

  mshrCtl.io.fromReqArb.status_s1 := reqArb.io.status_s1
  mshrCtl.io.fromMainPipe <> mainPipe.io.toMSHRCtl
  mshrCtl.io.fromMainPipe.mshr_alloc_s3 := mainPipe.io.toMSHRCtl.mshr_alloc_s3
  mshrCtl.io.grantStatus := grantBuf.io.grantStatus
  mshrCtl.io.resps.sinkC := sinkC.io.resp
  mshrCtl.io.resps.rxrsp := rxrsp.io.in
  mshrCtl.io.resps.rxdat := rxdat.io.in
  mshrCtl.io.nestedwb := mainPipe.io.nestedwb
  mshrCtl.io.replResp := directory.io.replResp
  mshrCtl.io.aMergeTask := reqBuf.io.aMergeTask
  // TODO: This is ugly
  mshrCtl.io.pipeStatusVec(0) := (reqArb.io.status_vec)(1) // s2 status
  mshrCtl.io.pipeStatusVec(1) := mainPipe.io.status_vec_toD(0) // s3 status

  /* Read and write release buffer (3 write ports: nested wb, SinkC, MainPipe) */
  releaseBuf.io.r := reqArb.io.releaseBufRead_s2
  val nestedWriteReleaseBuf,
    sinkCWriteReleaseBuf,
    mpWriteReleaseBuf = Wire(Valid(new MSHRBufWrite()))
  nestedWriteReleaseBuf.valid := mshrCtl.io.nestedwbDataId.valid
  nestedWriteReleaseBuf.bits.data := mainPipe.io.nestedwbData
  nestedWriteReleaseBuf.bits.id := mshrCtl.io.nestedwbDataId.bits
  nestedWriteReleaseBuf.bits.beatMask := Fill(beatSize, true.B)
  sinkCWriteReleaseBuf match { case x =>
    x := sinkC.io.releaseBufWrite
    x.bits.id := mshrCtl.io.releaseBufWriteId
  }
  mpWriteReleaseBuf := mainPipe.io.releaseBufWrite
  releaseBuf.io.w <> VecInit(Seq(
    nestedWriteReleaseBuf,
    sinkCWriteReleaseBuf,
    mpWriteReleaseBuf
  ))

  /* Read and write refill buffer (2 write ports: RXDAT, SinkC) */
  refillBuf.io.r := reqArb.io.refillBufRead_s2
  refillBuf.io.w <> VecInit(Seq(rxdat.io.refillBufWrite, sinkC.io.refillBufWrite))

  io.prefetch.foreach { p =>
    p.train <> mainPipe.io.prefetchTrain.get
    sinkA.io.prefetchReq.get <> p.req
    p.resp <> grantBuf.io.prefetchResp.get
    p.tlb_req.req.ready := true.B
    p.tlb_req.resp.valid := false.B
    p.tlb_req.resp.bits := DontCare
    p.recv_addr := 0.U.asTypeOf(p.recv_addr)
  }

  /* to Slice Top for pCrd info.*/
  io.waitPCrdInfo <> mshrCtl.io.waitPCrdInfo

  /* IO Connection */
  io.l1Hint <> mainPipe.io.l1Hint
  topDownOpt.foreach (
    _ => {
      io.msStatus.get := mshrCtl.io.msStatus.get
      io.dirResult.get.valid := directory.io.resp.valid && !directory.io.replResp.valid // exclude MSHR-Grant read-dir
      io.dirResult.get.bits := directory.io.resp.bits
      io.latePF.get := reqBuf.io.hasLatePF
    }
  )

  /* Connect upwards channels */
  val inBuf = cacheParams.innerBuf
  // val outBuf = tl2tlParams.outerBuf
  sinkA.io.a <> inBuf.a(io.in.a)
  io.in.b <> inBuf.b(mshrCtl.io.toSourceB)
  sinkC.io.c <> inBuf.c(io.in.c)
  io.in.d <> inBuf.d(grantBuf.io.d)
  grantBuf.io.e <> inBuf.e(io.in.e)

  /* Connect downwards channels */
  io.out.tx.req <> txreq.io.out
  io.out.tx.dat <> txdat.io.out
  io.out.tx.rsp <> txrsp.io.out
  rxsnp.io.rxsnp <> io.out.rx.snp
  rxdat.io.out <> io.out.rx.dat
  rxrsp.io.out <> io.out.rx.rsp

}
492 changes: 492 additions & 0 deletions src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala

Large diffs are not rendered by default.

132 changes: 132 additions & 0 deletions src/main/scala/coupledL2/tl2chi/TXDAT.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import coupledL2.{TaskWithData, TaskBundle}

// Back-pressure signals TXDAT raises toward RequestArb: in addition to the
// base MSHR-entrance block (inherited), TXDAT can also block SinkB (snoop)
// entrance, since snoops may generate TXDAT responses.
class TXDATBlockBundle(implicit p: Parameters) extends TXBlockBundle {
val blockSinkBReqEntrance = Bool()

// All-zero constant, i.e. "nothing blocked".
override def apply() = 0.U.asTypeOf(this)
}

// Serializes outbound data tasks from MainPipe into CHI DAT flits, one beat
// per flit, and back-pressures the pipeline entrance when it may overflow.
class TXDAT(implicit p: Parameters) extends TL2CHIL2Module {
val io = IO(new Bundle() {
val in = Flipped(DecoupledIO(new TaskWithData()))
val out = DecoupledIO(new CHIDAT())

val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI)))
val toReqArb = Output(new TXDATBlockBundle)
})

assert(!io.in.valid || io.in.bits.task.toTXDAT, "txChannel is wrong for TXDAT")
assert(io.in.ready, "TXDAT should never be full")
require(chiOpt.isDefined)
require(beatBytes * 8 == DATA_WIDTH)

// TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later
val queue = Module(new Queue(io.in.bits.cloneType, entries = mshrsAll, flow = true))
queue.io.enq <> io.in

// Back pressure logic from TXDAT
val queueCnt = queue.io.count
// TODO: this may be imprecise, review this later
val pipeStatus_s1_s5 = io.pipeStatusVec
val pipeStatus_s1_s2 = pipeStatus_s1_s5.take(2)
val pipeStatus_s2 = pipeStatus_s1_s2.tail
val pipeStatus_s3_s5 = pipeStatus_s1_s5.drop(2)
// inflightCnt equals the number of reqs on s2~s5 that may flow into TXDAT soon, plus queueCnt.
// The calculation of inflightCnt might be imprecise and leads to false positive back pressure.
val inflightCnt = PopCount(Cat(pipeStatus_s3_s5.map(s => s.valid && s.bits.toTXDAT && (s.bits.fromB || s.bits.mshrTask)))) +
PopCount(Cat(pipeStatus_s2.map(s => s.valid && Mux(s.bits.mshrTask, s.bits.toTXDAT, s.bits.fromB)))) +
queueCnt
// Snoops blocked when full; MSHR tasks blocked one entry earlier.
val noSpaceForSinkBReq = inflightCnt >= mshrsAll.U
val noSpaceForMSHRReq = inflightCnt >= (mshrsAll-1).U

io.toReqArb.blockSinkBReqEntrance := noSpaceForSinkBReq
io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq

// Beat serialization state: one valid bit per outstanding beat of the task
// currently held in taskR; the task is busy while any beat remains.
val beatValids = RegInit(VecInit(Seq.fill(beatSize)(false.B)))
val taskValid = beatValids.asUInt.orR
val taskR = RegInit(0.U.asTypeOf(new TaskWithData))

// Dequeue the next task only once all beats of the current one are sent.
val dequeueReady = !taskValid // TODO: this may introduce bubble?
queue.io.deq.ready := dequeueReady
when (queue.io.deq.fire) {
beatValids.foreach(_ := true.B)
taskR := queue.io.deq.bits
}

val data = taskR.data.data
val beatsOH = beatValids.asUInt
val (beat, next_beatsOH) = getBeat(data, beatsOH)

io.out.valid := taskValid
io.out.bits := toCHIDATBundle(taskR.task, beat, beatsOH)

// Retire the beat just sent.
when (io.out.fire) {
beatValids := VecInit(next_beatsOH.asBools)
}

// Selects the lowest-indexed pending beat of `data` per `beatsOH` and
// returns it together with the remaining-beats mask.
def getBeat(data: UInt, beatsOH: UInt): (UInt, UInt) = {
// get one beat from data according to beatsOH
require(data.getWidth == (blockBytes * 8))
require(beatsOH.getWidth == beatSize)
// next beat
val next_beat = ParallelPriorityMux(beatsOH, data.asTypeOf(Vec(beatSize, UInt((beatBytes * 8).W))))
val selOH = PriorityEncoderOH(beatsOH)
// remaining beats that haven't been sent out
val next_beatsOH = beatsOH & ~selOH
(next_beat, next_beatsOH)
}

// Packs one beat of a task into a CHI DAT flit; DataID encodes the beat's
// 16-byte chunk position within the cache line.
def toCHIDATBundle(task: TaskBundle, beat: UInt, beatsOH: UInt): CHIDAT = {
val dat = WireInit(0.U.asTypeOf(new CHIDAT()))

// width parameters and width check
require(beat.getWidth == dat.data.getWidth)
val beatOffsetWidth = log2Up(beatBytes)
val chunkOffsetWidth = log2Up(16) // DataID is assigned with the granularity of a 16-byte chunk

dat.tgtID := task.tgtID.get
dat.srcID := task.srcID.get
dat.txnID := task.txnID.get
dat.homeNID := task.homeNID.get
dat.dbID := task.dbID.get
dat.opcode := task.chiOpcode.get
dat.ccID := 0.U // TODO: consider critical chunk id
// The DataID field value must be set to Addr[5:4] because the DataID field represents Addr[5:4] of the lowest
// addressed byte within the packet.
// dat.dataID := ParallelPriorityMux(beatsOH.asBools.zipWithIndex.map(x => (x._1, (x._2 << beatOffsetWidth).U(5, 4))))
dat.dataID := ParallelPriorityMux(
beatsOH,
List.tabulate(beatSize)(i => (i << (beatOffsetWidth - chunkOffsetWidth)).U)
)
dat.be := Fill(BE_WIDTH, 1.U(1.W))
dat.data := beat
dat.resp := task.resp.get
dat.fwdState := task.fwdState.get

dat
}

}
76 changes: 76 additions & 0 deletions src/main/scala/coupledL2/tl2chi/TXREQ.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters

// Base back-pressure bundle a TX channel raises toward RequestArb:
// blocks MSHR tasks from entering the pipeline when the channel may overflow.
class TXBlockBundle(implicit p: Parameters) extends TL2CHIL2Bundle {
// val blockSinkBReqEntrance = Bool()
val blockMSHRReqEntrance = Bool()

// All-zero constant, i.e. "nothing blocked".
def apply() = 0.U.asTypeOf(this)
}

class TXREQ(implicit p: Parameters) extends TL2CHIL2Module {
  val io = IO(new Bundle() {
    val pipeReq = Flipped(DecoupledIO(new CHIREQ()))
    val mshrReq = Flipped(DecoupledIO(new CHIREQ()))
    val out = DecoupledIO(new CHIREQ())

    val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI)))
    val toReqArb = Output(new TXBlockBundle)

    val sliceId = Input(UInt(bankBits.W))
  })

  /* Queues outbound CHI requests from MainPipe and MSHRs. Pipeline requests
   * must never be refused here; back pressure is instead exerted earlier,
   * at MSHR-request entrance, via io.toReqArb. */
  assert(!io.pipeReq.valid || io.pipeReq.ready, "TXREQ should always be ready for pipeline req")
  require(chiOpt.isDefined)

  // `flow = true` lets a request pass straight through when the queue is empty.
  // TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later
  val reqQueue = Module(new Queue(new CHIREQ, entries = mshrsAll, flow = true))

  // Conservative occupancy estimate: queued entries, plus MSHR tasks on s2~s5
  // headed for TXREQ, plus whatever occupies s1. Overcounting only causes
  // harmless false-positive back pressure.
  // TODO: this may be imprecise, review this later
  val occupancy = reqQueue.io.count
  val statusS1 = io.pipeStatusVec.head
  val statusS2toS5 = io.pipeStatusVec.tail
  val mayFlowIn = PopCount(Cat(statusS2toS5.map(s => s.valid && s.bits.mshrTask && s.bits.toTXREQ)))
  val inflight = mayFlowIn + statusS1.valid.asUInt + occupancy
  val backPressure = inflight >= mshrsAll.U

  io.toReqArb.blockMSHRReqEntrance := backPressure

  // Pipeline requests always win the enqueue slot; MSHR requests enter only
  // when space is guaranteed.
  reqQueue.io.enq.valid := io.pipeReq.valid || (io.mshrReq.valid && !backPressure)
  reqQueue.io.enq.bits := Mux(io.pipeReq.valid, io.pipeReq.bits, io.mshrReq.bits)

  io.pipeReq.ready := true.B
  io.mshrReq.ready := !io.pipeReq.valid && !backPressure

  // Decoupled2LCredit(reqQueue.io.deq, io.out)
  io.out <> reqQueue.io.deq
  // Post-queue fix-ups (last-connect overrides): target node looked up in the
  // SAM, full cache-line size, and the slice-restored physical address.
  io.out.bits.tgtID := SAM(sam).lookup(io.out.bits.addr)
  io.out.bits.size := log2Ceil(blockBytes).U(SIZE_WIDTH.W) // TODO
  io.out.bits.addr := restoreAddressUInt(reqQueue.io.deq.bits.addr, io.sliceId)
}
91 changes: 91 additions & 0 deletions src/main/scala/coupledL2/tl2chi/TXRSP.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.tl2chi

import chisel3._
import chisel3.util._
import utility._
import org.chipsalliance.cde.config.Parameters
import coupledL2.TaskBundle

/**
 * Back-pressure bundle driven by TXRSP towards the request arbiter.
 * Extends TXBlockBundle (which carries blockMSHRReqEntrance) with an extra
 * flag to block sink-B (snoop) requests from entering the arbiter.
 */
class TXRSPBlockBundle(implicit p: Parameters) extends TXBlockBundle {
  val blockSinkBReqEntrance = Bool()

  // Convenience constructor: an all-zero (i.e. non-blocking) bundle.
  override def apply() = 0.U.asTypeOf(this)
}

/**
 * TXRSP: transmit buffer for the outgoing CHI RSP (response) channel.
 *
 * Gathers CHI responses from the main pipeline (`pipeRsp`, carried as a
 * TaskBundle and converted here) and from MSHRs (`mshrRsp`, already a
 * CHIRSP), queues them, and drives them on `out`. Like TXREQ, the pipeline
 * source must never be stalled, so conservative back-pressure
 * (`toReqArb`) blocks new sink-B and MSHR tasks at the arbiter whenever
 * the queue might fill up.
 */
class TXRSP(implicit p: Parameters) extends TL2CHIL2Module {
  val io = IO(new Bundle() {
    // val in = Flipped(DecoupledIO(new TaskBundle()))
    // Responses from the pipeline; must always be accepted (see assert below).
    val pipeRsp = Flipped(DecoupledIO(new TaskBundle))
    // Responses from MSHRs; stalled when the queue is (conservatively) full.
    val mshrRsp = Flipped(DecoupledIO(new CHIRSP()))
    // Outgoing CHI RSP channel.
    val out = DecoupledIO(new CHIRSP())

    // Status of pipeline stages s1..s5, used to estimate in-flight responses.
    val pipeStatusVec = Flipped(Vec(5, ValidIO(new PipeStatusWithCHI)))
    // Back-pressure towards the request arbiter (sink-B and MSHR entrance).
    val toReqArb = Output(new TXRSPBlockBundle)
  })

  // Sanity check: any pipeline task arriving here must be destined for TXRSP.
  assert(!io.pipeRsp.valid || io.pipeRsp.bits.toTXRSP, "txChannel is wrong for TXRSP")
  // pipeRsp.ready is tied to true.B below; this assertion documents the
  // "pipeline is never stalled here" invariant.
  assert(io.pipeRsp.ready, "TXRSP should never be full")
  require(chiOpt.isDefined)

  // TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later
  // flow = true: an enqueued entry may bypass to deq in the same cycle when empty.
  val queue = Module(new Queue(new CHIRSP, entries = mshrsAll, flow = true))

  // Back pressure logic from TXRSP
  val queueCnt = queue.io.count
  // TODO: this may be imprecise, review this later
  val pipeStatus_s1_s5 = io.pipeStatusVec
  val pipeStatus_s1_s2 = pipeStatus_s1_s5.take(2)
  val pipeStatus_s2 = pipeStatus_s1_s2.tail
  val pipeStatus_s3_s5 = pipeStatus_s1_s5.drop(2)
  // inflightCnt equals the number of reqs on s2~s5 that may flow into TXRSP soon, plus queueCnt.
  // The calculation of inflightCnt might be imprecise and leads to false positive back pressue.
  // s3~s5: count tasks headed to TXRSP that came from sink B or an MSHR.
  // s2: for MSHR tasks the channel is known (toTXRSP); for others any sink-B
  // task is counted -- presumably because its channel is not yet resolved at
  // s2; confirm.
  val inflightCnt = PopCount(Cat(pipeStatus_s3_s5.map(s => s.valid && s.bits.toTXRSP && (s.bits.fromB || s.bits.mshrTask)))) +
    PopCount(Cat(pipeStatus_s2.map(s => s.valid && Mux(s.bits.mshrTask, s.bits.toTXRSP, s.bits.fromB)))) +
    queueCnt
  // Two thresholds: MSHR requests are blocked one entry earlier than sink-B
  // requests, apparently reserving a slot for sink-B-induced responses -- confirm.
  val noSpaceForSinkBReq = inflightCnt >= mshrsAll.U
  val noSpaceForMSHRReq = inflightCnt >= (mshrsAll-1).U

  io.toReqArb.blockSinkBReqEntrance := noSpaceForSinkBReq
  io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq

  io.out.valid := queue.io.deq.valid
  io.out.bits := queue.io.deq.bits
  queue.io.deq.ready := io.out.ready

  // Scala precedence: && binds tighter than ||, so the pipeline always
  // enqueues; MSHR responses only when both space conditions hold.
  queue.io.enq.valid := io.pipeRsp.valid || io.mshrRsp.valid && !noSpaceForSinkBReq && !noSpaceForMSHRReq
  // Pipeline responses take strict priority; TaskBundle is converted to CHIRSP.
  queue.io.enq.bits := Mux(io.pipeRsp.valid, toCHIRSPBundle(io.pipeRsp.bits), io.mshrRsp.bits)

  io.pipeRsp.ready := true.B
  // MSHR rsp accepted only when no pipeline rsp competes and space remains.
  io.mshrRsp.ready := !io.pipeRsp.valid && !noSpaceForSinkBReq && !noSpaceForMSHRReq

  /**
   * Converts a pipeline TaskBundle into a CHI RSP flit.
   * The CHI-specific task fields are Options; the .get calls rely on CHI
   * being enabled (guaranteed by the require(chiOpt.isDefined) above).
   * Fields not listed stay zero from the WireInit.
   */
  def toCHIRSPBundle(task: TaskBundle): CHIRSP = {
    val rsp = WireInit(0.U.asTypeOf(new CHIRSP()))
    rsp.tgtID := task.tgtID.get
    rsp.srcID := task.srcID.get
    rsp.txnID := task.txnID.get
    rsp.dbID := task.dbID.get
    rsp.pCrdType := task.pCrdType.get
    rsp.opcode := task.chiOpcode.get
    rsp.resp := task.resp.get
    rsp.fwdState := task.fwdState.get
    // TODO: Finish this
    rsp
  }
}
Loading