OpenXiangShan · Kumonda221-CrO3 · May 7, 2024 · Mar 20, 2024 · Mar 26, 2024 · Mar 29, 2024
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -6,9 +6,9 @@ name: CI
 on:
   # Triggers the workflow on push or pull request events but only for the main branch
   push:
-    branches: [ chi-coupledl2, chi-coupledl2-ci-test ]
+    branches: [ master, chi-coupledl2 ]
   pull_request:
-    branches: [ chi-coupledl2, chi-coupledl2-ci-test ]
+    branches: [ master, chi-coupledl2 ]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -45,9 +45,19 @@ jobs:
       # - name: Check scalafmt
       #   run: make checkformat
 
-      # - name: Compile
-      #   run: make compile
-
-      - name: Compile QuadCore
+      - name: Compile
+        run: make compile
+
+      - name: Unit test for TileLink version
+        run: |
+            git clone https://github.com/OpenXiangShan/tl-test -b coupledL2-huancun
+            make test-top-l2l3l2
+            cd ./tl-test
+            mkdir -p build && cd build
+            cmake .. -DDUT_DIR=../../build -DCHISELDB=1
+            make
+            SEED=$RANDOM; echo "tlc_test seed: $SEED"; ./tlc_test -s "$SEED"  # log the seed so a failing run can be reproduced
+
+      - name: Compile CHI QuadCore
         run: |
             make test-top-chi-quadcore-2ul
diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala
@@ -316,3 +316,48 @@ class L2ToL1Hint(implicit p: Parameters) extends Bundle {
   val sourceId = UInt(32.W)    // tilelink sourceID
   val isKeyword = Bool()       // miss entry keyword
 }
+
+// Custom L2 <-> L1 TLB interface
+// FIXME lyq: TlbCmd and TlbExceptionBundle are redefined here; how can the corresponding L1 bundles be used instead?
+object TlbCmd {
+  def read  = "b00".U // bits [1:0] = 00
+  def write = "b01".U // bits [1:0] = 01
+  def exec  = "b10".U // bits [1:0] = 10
+
+  def atom_read  = "b100".U // lr
+  def atom_write = "b101".U // sc / amo
+
+  def apply() = UInt(3.W) // hardware type of a command: 3 bits wide
+  def isRead(a: UInt) = a(1,0)===read // only bits [1:0] are compared, so atom_read also matches
+  def isWrite(a: UInt) = a(1,0)===write // only bits [1:0] are compared, so atom_write also matches
+  def isExec(a: UInt) = a(1,0)===exec // only bits [1:0] are compared
+
+  def isAtom(a: UInt) = a(2) // bit 2 is set in both atom_read and atom_write
+  def isAmo(a: UInt) = a===atom_write // full-width match; NOTE: sc shares this encoding, so sc is matched too
+}
+class TlbExceptionBundle extends Bundle { // three Bool flags, each declared as an Output
+  val ld = Output(Bool()) // presumably the load case — confirm against the L1 TLB bundle
+  val st = Output(Bool()) // presumably the store case — confirm against the L1 TLB bundle
+  val instr = Output(Bool()) // presumably the instruction-fetch case — confirm against the L1 TLB bundle
+}
+class L2TlbReq(implicit p: Parameters) extends L2Bundle{ // TLB request payload; every field is declared as an Output
+  val vaddr = Output(UInt((fullVAddrBits+offsetBits).W)) // NOTE(review): fullVAddrBits is defined as vaddrBitsOpt + offsetBits, so offsetBits is counted twice here — confirm this width is intended
+  val cmd = Output(TlbCmd()) // typed by TlbCmd.apply()
+  val size = Output(UInt(log2Ceil(log2Ceil(XLEN/8) + 1).W)) // wide enough to hold any value in 0..log2Ceil(XLEN/8)
+  val kill = Output(Bool()) // Used for a blocking TLB that needs to sync with another module such as the icache
+  val no_translate = Output(Bool()) // do not translate, but still do pmp/pma check
+}
+class L2TlbResp(nDups: Int = 1)(implicit p: Parameters) extends L2Bundle { // TLB response payload; nDups sets the length of the paddr and excp vectors
+  val paddr = Vec(nDups, Output(UInt(fullAddressBits.W))) // one physical address per copy
+  val miss = Output(Bool()) // single flag, not duplicated per copy
+  val excp = Vec(nDups, new Bundle { // one group of exception flags per copy
+    val gpf = new TlbExceptionBundle() // presumably guest page fault — confirm
+    val pf = new TlbExceptionBundle() // presumably page fault — confirm
+    val af = new TlbExceptionBundle() // presumably access fault — confirm
+  })
+}
+class L2ToL1TlbIO(nRespDups: Int = 1)(implicit p: Parameters) extends L2Bundle{ // TLB port bundle: a decoupled request plus a flipped decoupled response
+  val req = DecoupledIO(new L2TlbReq) // request channel with ready/valid handshake
+  val req_kill = Output(Bool()) // separate Output, not part of the req handshake bundle
+  val resp = Flipped(DecoupledIO(new L2TlbResp(nRespDups))) // Flipped: opposite direction to req; nRespDups is passed through as nDups
+}
diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala
@@ -38,6 +38,7 @@ trait HasCoupledL2Parameters {
   val enableCHI = p(EnableCHI)
   val cacheParams = p(L2ParamKey)
 
+  val XLEN = 64
   val blocks = cacheParams.sets * cacheParams.ways
   val blockBytes = cacheParams.blockBytes
   val beatBytes = cacheParams.channelBytes.d.get
@@ -51,8 +52,10 @@ trait HasCoupledL2Parameters {
   val chiOpt = if (enableCHI) Some(true) else None
   val aliasBitsOpt = if(cacheParams.clientCaches.isEmpty) None
                   else cacheParams.clientCaches.head.aliasBitsOpt
+  // vaddr without offset bits
   val vaddrBitsOpt = if(cacheParams.clientCaches.isEmpty) None
                   else cacheParams.clientCaches.head.vaddrBitsOpt
+  val fullVAddrBits = vaddrBitsOpt.getOrElse(0) + offsetBits
   // from L1 load miss cache require
   val isKeywordBitsOpt = if(cacheParams.clientCaches.isEmpty) None
                   else cacheParams.clientCaches.head.isKeywordBitsOpt

diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala
@@ -216,6 +216,8 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
   val pftRespEntry = new Bundle() {
     val tag = UInt(tagBits.W)
     val set = UInt(setBits.W)
+    val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W))
+    val pfSource = UInt(MemReqSource.reqSourceBits.W)
   }
   // TODO: this may not need 10 entries, but this does not take much space
   val pftQueueLen = 10
@@ -225,11 +227,15 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
       io.d_task.bits.task.fromL2pft.getOrElse(false.B)
     pftRespQueue.get.io.enq.bits.tag := io.d_task.bits.task.tag
     pftRespQueue.get.io.enq.bits.set := io.d_task.bits.task.set
+    pftRespQueue.get.io.enq.bits.vaddr.foreach(_ := io.d_task.bits.task.vaddr.getOrElse(0.U))
+    pftRespQueue.get.io.enq.bits.pfSource := io.d_task.bits.task.reqSource
 
     val resp = io.prefetchResp.get
     resp.valid := pftRespQueue.get.io.deq.valid
     resp.bits.tag := pftRespQueue.get.io.deq.bits.tag
     resp.bits.set := pftRespQueue.get.io.deq.bits.set
+    resp.bits.vaddr.foreach(_ := pftRespQueue.get.io.deq.bits.vaddr.getOrElse(0.U))
+    resp.bits.pfSource := pftRespQueue.get.io.deq.bits.pfSource
     pftRespQueue.get.io.deq.ready := resp.ready
 
     assert(pftRespQueue.get.io.enq.ready, "pftRespQueue should never be full, no back pressure logic")

diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala
@@ -93,8 +93,8 @@ class SinkA(implicit p: Parameters) extends L2Module {
     task.mshrId := 0.U(mshrBits.W)
     task.aliasTask.foreach(_ := false.B)
     task.useProbeData := false.B
+    task.fromL2pft.foreach(_ := req.needAck)
     task.mshrRetry := false.B
-    task.fromL2pft.foreach(_ := req.isBOP)
     task.needHint.foreach(_ := false.B)
     task.dirty := false.B
     task.way := 0.U(wayBits.W)
@@ -105,7 +105,7 @@ class SinkA(implicit p: Parameters) extends L2Module {
     task.wayMask := 0.U(cacheParams.ways.W)
     task.reqSource := req.pfSource
     task.replTask := false.B
-    task.vaddr.foreach(_ := 0.U)
+    task.vaddr.foreach(_ := req.vaddr.getOrElse(0.U))
     task.isKeyword.foreach(_ := false.B)
     task.mergeA := false.B
     task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle)
@@ -137,8 +137,8 @@ class SinkA(implicit p: Parameters) extends L2Module {
   prefetchOpt.foreach {
     _ =>
       XSPerfAccumulate(cacheParams, "sinkA_prefetch_req", io.prefetchReq.get.fire)
-      XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire)
-      XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire)
+      XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire)
+      XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire)
   }
 
   // cycels stalled by mainpipe

diff --git a/src/main/scala/coupledL2/TopDownMonitor.scala b/src/main/scala/coupledL2/TopDownMonitor.scala
@@ -32,6 +32,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
     val msStatus  = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRStatus))))
     val latePF    = Vec(banks, Input(Bool()))
     val debugTopDown = new Bundle {
+      val robTrueCommit = Input(UInt(64.W))
       val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W))))
       val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool()))
     }
@@ -114,6 +115,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
   val l2prefetchSent = dirResultMatchVec(
     r =>  !r.hit &&
       (r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U ||
+       r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U ||
        r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U ||
        r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stride.id.U ||
        r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stream.id.U ||
@@ -122,6 +124,9 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
   val l2prefetchSentBOP = dirResultMatchVec(
     r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U
   )
+  val l2prefetchSentPBOP = dirResultMatchVec(
+    r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U
+  )
   val l2prefetchSentSMS = dirResultMatchVec(
     r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U
   )
@@ -142,6 +147,10 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
     r => reqFromCPU(r) && r.hit &&
       r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.BOP.id.U
   )
+  val l2prefetchUsefulPBOP = dirResultMatchVec(
+    r => reqFromCPU(r) && r.hit &&
+      r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.PBOP.id.U
+  )
   val l2prefetchUsefulSMS = dirResultMatchVec(
     r => reqFromCPU(r) && r.hit &&
       r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.SMS.id.U
@@ -168,81 +177,91 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracy",
     PopCount(l2prefetchUseful), PopCount(l2prefetchSent),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracyBOP",
     PopCount(l2prefetchUsefulBOP), PopCount(l2prefetchSentBOP),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
+  )
+  XSPerfRolling(
+    cacheParams, "L2PrefetchAccuracyPBOP",
+    PopCount(l2prefetchUsefulPBOP), PopCount(l2prefetchSentPBOP),
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracySMS",
     PopCount(l2prefetchUsefulSMS), PopCount(l2prefetchSentSMS),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracyTP",
     PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracyStride",
     PopCount(l2prefetchUsefulStride), PopCount(l2prefetchSentStride),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracyStream",
     PopCount(l2prefetchUsefulStream), PopCount(l2prefetchSentStream),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchAccuracyTP",
     PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
 
   // PF Late
   XSPerfRolling(
     cacheParams, "L2PrefetchLate",
     PopCount(l2prefetchLate), PopCount(l2prefetchUseful),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
 
   // PF Coverage
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverage",
     PopCount(l2prefetchUseful), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageBOP",
     PopCount(l2prefetchUsefulBOP), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
+  )
+  XSPerfRolling(
+    cacheParams, "L2PrefetchCoveragePBOP",
+    PopCount(l2prefetchUsefulPBOP), PopCount(l2demandRequest),
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageSMS",
     PopCount(l2prefetchUsefulSMS), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageTP",
     PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageStride",
     PopCount(l2prefetchUsefulStride), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageStream",
     PopCount(l2prefetchUsefulStream), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
   XSPerfRolling(
     cacheParams, "L2PrefetchCoverageTP",
     PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest),
-    1000, clock, reset
+    1000, io.debugTopDown.robTrueCommit, clock, reset
   )
 
   XSPerfAccumulate(cacheParams, "l2prefetchSent", PopCount(l2prefetchSent))