From 2a1d3b0f9d0dc29a0535769623827f5bad61f24d Mon Sep 17 00:00:00 2001 From: Tang Haojin Date: Wed, 20 Mar 2024 16:07:19 +0800 Subject: [PATCH 1/9] L2Param: fix inconsistency of chiselDB availability (#98) --- HuanCun | 2 +- src/main/scala/coupledL2/L2Param.scala | 2 ++ src/main/scala/coupledL2/Slice.scala | 9 ++------- src/main/scala/coupledL2/debug/Monitor.scala | 2 +- src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala | 4 ++-- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/HuanCun b/HuanCun index a4b0c8a89..fdd56a568 160000 --- a/HuanCun +++ b/HuanCun @@ -1 +1 @@ -Subproject commit a4b0c8a89d4360dacd6d910efe794bd638ea5596 +Subproject commit fdd56a56874b83f295bfb336a02d923342c933f6 diff --git a/src/main/scala/coupledL2/L2Param.scala b/src/main/scala/coupledL2/L2Param.scala index 7f4c69c9a..3522ef732 100644 --- a/src/main/scala/coupledL2/L2Param.scala +++ b/src/main/scala/coupledL2/L2Param.scala @@ -95,6 +95,8 @@ case class L2Param prefetch: Option[PrefetchParameters] = None, // Performance analysis enablePerf: Boolean = true, + // RollingDB + enableRollingDB: Boolean = true, // Monitor enableMonitor: Boolean = true, // TopDown diff --git a/src/main/scala/coupledL2/Slice.scala b/src/main/scala/coupledL2/Slice.scala index ea0275254..a81a804f5 100644 --- a/src/main/scala/coupledL2/Slice.scala +++ b/src/main/scala/coupledL2/Slice.scala @@ -195,11 +195,6 @@ class Slice()(implicit p: Parameters) extends L2Module { XSPerfHistogram(cacheParams, "a_to_d_delay", delay, delay_sample, 500, 1000, 100, true, false) } - if (cacheParams.enableMonitor) { - val monitor = Module(new Monitor()) - monitor.io.fromMainPipe <> mainPipe.io.toMonitor -// monitor.io.nestedWBValid := mshrCtl.io.nestedwbDataId.valid - } else { - mainPipe.io.toMonitor <> DontCare - } + val monitor = Module(new Monitor()) + monitor.io.fromMainPipe <> mainPipe.io.toMonitor } diff --git a/src/main/scala/coupledL2/debug/Monitor.scala b/src/main/scala/coupledL2/debug/Monitor.scala index f62ab41b6..72518cee5 100644 --- a/src/main/scala/coupledL2/debug/Monitor.scala +++ b/src/main/scala/coupledL2/debug/Monitor.scala @@ -86,7 +86,7 @@ class Monitor(implicit p: Parameters) extends L2Module { /* ======== ChiselDB ======== */ // assert(cacheParams.hartIds.length == 1, "private L2 should have one and only one hardId") - if (!cacheParams.FPGAPlatform) { + if (cacheParams.enableMonitor && !cacheParams.FPGAPlatform) { val hartId = if (cacheParams.hartIds.length == 1) cacheParams.hartIds.head else 0 val table = ChiselDB.createTable(s"L2MP", new CPL2S3Info, basicDB = true) val s3Info = Wire(new CPL2S3Info) diff --git a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala index e3c5046f5..ab4e4fda8 100644 --- a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala +++ b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala @@ -136,7 +136,7 @@ object XSPerfRolling { clock: Clock, reset: Reset ): Unit = { - if (params.enablePerf && !params.FPGAPlatform) { + if (params.enableRollingDB && !params.FPGAPlatform) { val tableName = perfName + "_rolling_0" // TODO: support naming hart id val rollingTable = ChiselDB.createTable(tableName, new RollingEntry(), basicDB=true) @@ -168,7 +168,7 @@ object XSPerfRolling { clock: Clock, reset: Reset ): Unit = { - if (params.enablePerf && !params.FPGAPlatform) { + if (params.enableRollingDB && !params.FPGAPlatform) { val tableName = perfName + "_rolling_0" // TODO: support naming hart id val rollingTable = 
ChiselDB.createTable(tableName, new RollingEntry(), basicDB=true) From a6b668e2f81dcc2705f05ac29664c009f98ce02b Mon Sep 17 00:00:00 2001 From: Luoshan Cai <60723329+cailuoshan@users.noreply.github.com> Date: Tue, 26 Mar 2024 15:34:15 +0800 Subject: [PATCH 2/9] Fix some bugs for CPL2 (#99) * SinkC: fix bug for regs Buf not init * MSHR: fix bug when L1_acquirePerm but L2_miss, L2 should acquireBlock to L3, not only acquirePerm * MainPipe: when L3_probetoB and L2=TIP, L2 donot need probetoB L1 * SinkB: cannot accept Probe when same-addr Release to L3 and have not receive ReleaseAck --------- Co-authored-by: Cai Luoshan --- src/main/scala/coupledL2/Common.scala | 1 + src/main/scala/coupledL2/MSHR.scala | 3 ++- src/main/scala/coupledL2/MainPipe.scala | 2 +- src/main/scala/coupledL2/SinkB.scala | 4 ++-- src/main/scala/coupledL2/SinkC.scala | 4 ++-- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala index 3d9c9d0af..e87fcf9d7 100644 --- a/src/main/scala/coupledL2/Common.scala +++ b/src/main/scala/coupledL2/Common.scala @@ -185,6 +185,7 @@ class MSHRInfo(implicit p: Parameters) extends L2Bundle { val s_refill = Bool() val param = UInt(3.W) val mergeA = Bool() // whether the mshr already merge an acquire(avoid alias merge) + val w_releaseack = Bool() } class RespInfoBundle(implicit p: Parameters) extends L2Bundle { diff --git a/src/main/scala/coupledL2/MSHR.scala b/src/main/scala/coupledL2/MSHR.scala index ec7fc541b..fb7e344a0 100644 --- a/src/main/scala/coupledL2/MSHR.scala +++ b/src/main/scala/coupledL2/MSHR.scala @@ -125,7 +125,7 @@ class MSHR(implicit p: Parameters) extends L2Module { oa.off := req.off oa.source := io.id oa.opcode := Mux( - req_acquirePerm, + req_acquirePerm && dirResult.hit, req.opcode, // Get or AcquireBlock AcquireBlock @@ -548,6 +548,7 @@ class MSHR(implicit p: Parameters) extends L2Module { io.msInfo.bits.s_refill := state.s_refill io.msInfo.bits.param := req.param io.msInfo.bits.mergeA := mergeA + io.msInfo.bits.w_releaseack := state.w_releaseack assert(!(c_resp.valid && !io.status.bits.w_c_resp)) assert(!(d_resp.valid && !io.status.bits.w_d_resp)) diff --git a/src/main/scala/coupledL2/MainPipe.scala b/src/main/scala/coupledL2/MainPipe.scala index 80e77ee26..e83f879cd 100644 --- a/src/main/scala/coupledL2/MainPipe.scala +++ b/src/main/scala/coupledL2/MainPipe.scala @@ -178,7 +178,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { val need_mshr_s3_a = need_acquire_s3_a || need_probe_s3_a || cache_alias // For channel B reqs, alloc mshr when Probe hits in both self and client dir val need_mshr_s3_b = dirResult_s3.hit && req_s3.fromB && - !(meta_s3.state === BRANCH && req_s3.param === toB) && + !((meta_s3.state === BRANCH || meta_s3.state === TIP) && req_s3.param === toB) && meta_has_clients_s3 // For channel C reqs, Release will always hit on MainPipe, no need for MSHR diff --git a/src/main/scala/coupledL2/SinkB.scala b/src/main/scala/coupledL2/SinkB.scala index 158e341bf..d68b6778d 100644 --- a/src/main/scala/coupledL2/SinkB.scala +++ b/src/main/scala/coupledL2/SinkB.scala @@ -75,9 +75,9 @@ class SinkB(implicit p: Parameters) extends L2Module { s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag && !s.bits.willFree && !s.bits.nestB )).asUInt.orR - // unable to accept incoming B req because same-addr as some MSHR replaced block and cannot nest + // unable to accept incoming B req because same-addr Release to L3 and have not received ReleaseAck, and some MSHR 
replaced block and cannot nest val replaceConflictMask = VecInit(io.msInfo.map(s => - s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && s.bits.blockRefill + s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && s.bits.blockRefill && !s.bits.w_releaseack )).asUInt val replaceConflict = replaceConflictMask.orR diff --git a/src/main/scala/coupledL2/SinkC.scala b/src/main/scala/coupledL2/SinkC.scala index f7e4e4847..25fb58d73 100644 --- a/src/main/scala/coupledL2/SinkC.scala +++ b/src/main/scala/coupledL2/SinkC.scala @@ -50,10 +50,10 @@ class SinkC(implicit p: Parameters) extends L2Module { // dataBuf entry is valid when Release has data // taskBuf entry is valid when ReqArb is not ready to receive C tasks - val dataBuf = Reg(Vec(bufBlocks, Vec(beatSize, UInt((beatBytes * 8).W)))) + val dataBuf = RegInit(VecInit(Seq.fill(bufBlocks)(VecInit(Seq.fill(beatSize)(0.U.asTypeOf(UInt((beatBytes * 8).W))))))) val beatValids = RegInit(VecInit(Seq.fill(bufBlocks)(VecInit(Seq.fill(beatSize)(false.B))))) val dataValids = VecInit(beatValids.map(_.asUInt.orR)).asUInt - val taskBuf = Reg(Vec(bufBlocks, new TaskBundle)) + val taskBuf = RegInit(VecInit(Seq.fill(bufBlocks)(0.U.asTypeOf(new TaskBundle)))) val taskValids = RegInit(VecInit(Seq.fill(bufBlocks)(false.B))) val taskArb = Module(new RRArbiter(new TaskBundle, bufBlocks)) val bufValids = taskValids.asUInt | dataValids From ab7d455810522816833f993a69eba3676fe4b6cc Mon Sep 17 00:00:00 2001 From: Chen Xi <48302201+Ivyfeather@users.noreply.github.com> Date: Fri, 29 Mar 2024 14:48:41 +0800 Subject: [PATCH 3/9] New hint design (#101) * ReqArb: only give s1 info when s1-to-s2 fire * Backbone: new structure for Hint now we send Hint @s1 for every MSHR-GrantData passing by and send Hint @s3 for every CHN-GrantData passing by * CoupledL2: update logic for Hint Arb * misc: fix connection * Hint: consider new feature ** AMergeTask ** * Hint: fix Hint Arb among Slices * Hint: add keyword info --- src/main/scala/coupledL2/CoupledL2.scala | 121 +++++---- src/main/scala/coupledL2/CustomL1Hint.scala | 256 ++++++-------------- src/main/scala/coupledL2/GrantBuffer.scala | 43 +--- src/main/scala/coupledL2/MainPipe.scala | 27 +-- src/main/scala/coupledL2/RequestArb.scala | 8 +- src/main/scala/coupledL2/Slice.scala | 3 +- src/main/scala/coupledL2/SourceC.scala | 2 +- 7 files changed, 171 insertions(+), 289 deletions(-) diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala index 7ef75339e..94f99b75b 100644 --- a/src/main/scala/coupledL2/CoupledL2.scala +++ b/src/main/scala/coupledL2/CoupledL2.scala @@ -21,7 +21,7 @@ package coupledL2 import chisel3._ import chisel3.util._ -import utility.{FastArbiter, Pipeline} +import utility.{FastArbiter, ParallelMax, ParallelPriorityMux, Pipeline, RegNextN} import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ @@ -327,16 +327,19 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para if(bankBits == 0) true.B else set(bankBits - 1, 0) === bankId.U } - def RegNextN[T <: Data](data: T, n: Int): T = { - if(n == 1) - RegNext(data) - else - RegNextN(data, n - 1) - } + // ** WARNING:TODO: this depends on where the latch is + // ** if Hint latched in slice, while D-Channel latched in XSTile + // ** we need only [hintCycleAhead - 1] later + val sliceAhead = hintCycleAhead - 1 + + val hintChosen = Wire(UInt(banks.W)) + val hintFire = Wire(Bool()) - val hint_chosen = 
Wire(UInt(node.in.size.W)) - val hint_fire = Wire(Bool()) - val release_sourceD_condition = Wire(Vec(node.in.size, Bool())) + // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice + // then we releaseSourceD, enabling io.d.ready for other slices + // TODO: if Hint for single slice is 100% accurate, may consider remove this + val releaseSourceD = Wire(Vec(banks, Bool())) + val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR val slices = node.in.zip(node.out).zipWithIndex.map { case (((in, edgeIn), (out, edgeOut)), i) => @@ -350,16 +353,21 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para case SliceIdKey => i })) } - val sourceD_can_go = RegNextN(!hint_fire || i.U === OHToUInt(hint_chosen), hintCycleAhead - 1) - release_sourceD_condition(i) := sourceD_can_go && !slice.io.in.d.valid slice.io.in <> in - if(enableHintGuidedGrant) { - // If the hint of slice X is selected in T cycle, then in T + 3 cycle we will try our best to select the grant of slice X. - // If slice X has no grant in T + 3 cycle, it means that the hint of T cycle is wrong, so relax the restriction on grant selection. - // Timing will be worse if enabled - in.d.valid := slice.io.in.d.valid && (sourceD_can_go || Cat(release_sourceD_condition).orR) - slice.io.in.d.ready := in.d.ready && (sourceD_can_go || Cat(release_sourceD_condition).orR) + if (enableHintGuidedGrant) { + // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) + // we will try our best to select the grant of slice X. + // If slice X has no grant then, it means that the hint at cycle T is wrong, + // so we relax the restriction on grant selection. + val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) || + RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1) + + releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid + + in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire) + slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire) } + in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i) out <> slice.io.out out.a.bits.address := restoreAddress(slice.io.out.a.bits.address, i) @@ -394,25 +402,33 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para slice } - val l1Hint_arb = Module(new Arbiter(new L2ToL1Hint, slices.size)) - val slices_l1Hint = slices.zipWithIndex.map { - case (s, i) => Pipeline(s.io.l1Hint, depth = 1, pipe = false, name = Some(s"l1Hint_buffer_$i")) + + if(enableHintGuidedGrant) { + // for timing consideration, hint should latch one cycle before sending to L1 + // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false" + val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size)) + val slices_l1Hint = slices.zipWithIndex.map { + case (s, i) => s.io.l1Hint + } + // should only Hint for DCache + val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients + .filter(_.supports.probe) + .map(c => { + (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) + }).head + + l1HintArb.io.in <> VecInit(slices_l1Hint) + io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache + io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart + io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword + // continuous hints can only be sent every two cycle, since GrantData takes two 
cycles + l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B) + + hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT ! + hintFire := io.l2_hint.valid } - val (client_sourceId_match_oh, client_sourceId_start) = node.in.head._2.client.clients - .map(c => { - (c.sourceId.contains(l1Hint_arb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) - }) - .unzip - l1Hint_arb.io.in <> VecInit(slices_l1Hint) - io.l2_hint.valid := l1Hint_arb.io.out.fire - io.l2_hint.bits.sourceId := l1Hint_arb.io.out.bits.sourceId - Mux1H(client_sourceId_match_oh, client_sourceId_start) - io.l2_hint.bits.isKeyword := l1Hint_arb.io.out.bits.isKeyword - // always ready for grant hint - l1Hint_arb.io.out.ready := true.B - - hint_chosen := l1Hint_arb.io.chosen - hint_fire := io.l2_hint.valid + // ==================== TopDown ==================== val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { case EdgeInKey => node.in.head._2 case EdgeOutKey => node.out.head._2 @@ -433,12 +449,35 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para case None => io.debugTopDown.l2MissMatch.foreach(_ := false.B) } - XSPerfAccumulate(cacheParams, "hint_fire", io.l2_hint.valid) - val grant_fire = slices.map{ slice => { - val (_, _, grant_fire_last, _) = node.in.head._2.count(slice.io.in.d) - slice.io.in.d.fire && grant_fire_last && slice.io.in.d.bits.opcode === GrantData - }} - XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_fire))) + // ==================== XSPerf Counters ==================== + val grant_data_fire = slices.map { slice => { + val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) + slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData + } + } + XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) + + val hint_source = io.l2_hint.bits.sourceId + + val grant_data_source = ParallelPriorityMux(slices.map { + s => (s.io.in.d.fire, s.io.in.d.bits.source) + }) + + val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) + hintPipe2.io.in.valid := io.l2_hint.valid + hintPipe2.io.in.bits := hint_source + hintPipe2.io.out.ready := true.B + + val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) + hintPipe1.io.in.valid := io.l2_hint.valid + hintPipe1.io.in.bits := hint_source + hintPipe1.io.out.ready := true.B + + val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) + + val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "ok2Hints", okHint) } lazy val module = new CoupledL2Imp(this) diff --git a/src/main/scala/coupledL2/CustomL1Hint.scala b/src/main/scala/coupledL2/CustomL1Hint.scala index e2512bf07..3cd350f31 100644 --- a/src/main/scala/coupledL2/CustomL1Hint.scala +++ b/src/main/scala/coupledL2/CustomL1Hint.scala @@ -24,205 +24,99 @@ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink.TLMessages._ import coupledL2.utils._ +class HintQueueEntry(implicit p: Parameters) extends L2Bundle { + val source = UInt(sourceIdBits.W) + val opcode = UInt(3.W) + val isKeyword = Bool() +} + class CustomL1HintIOBundle(implicit p: Parameters) extends L2Bundle { // input information val s1 = Flipped(ValidIO(new TaskBundle())) - val s2 = Flipped(ValidIO(new TaskBundle())) val s3 = new L2Bundle { val task = Flipped(ValidIO(new TaskBundle())) - val d = 
Input(Bool()) val need_mshr = Input(Bool()) } - val s4 = new L2Bundle { - val task = Flipped(ValidIO(new TaskBundle())) - val d = Input(Bool()) - val need_write_releaseBuf = Input(Bool()) - } - val s5 = new L2Bundle { - val task = Flipped(ValidIO(new TaskBundle())) - val d = Input(Bool()) - } - val globalCounter = Input(UInt((log2Ceil(mshrsAll) + 1).W)) - val grantBufferHint = Flipped(ValidIO(new L2ToL1Hint())) // output hint - val l1Hint = ValidIO(new L2ToL1Hint()) + val l1Hint = DecoupledIO(new L2ToL1Hint()) } - // grantData hint interface // use this interface to give a hint to l1 before actually sending a GrantData class CustomL1Hint(implicit p: Parameters) extends L2Module { val io = IO(new CustomL1HintIOBundle) val task_s1 = io.s1 - val task_s2 = io.s2 val task_s3 = io.s3.task - val task_s4 = io.s4.task - val task_s5 = io.s5.task - - val d_s3 = io.s3.d - val d_s4 = io.s4.d - val d_s5 = io.s5.d - - require(hintCycleAhead <= 3) - - // only use lower 2 bits of io.globalCounter to make timing happy - // as main pipeline will not trigger hint if io.globalCounter >= hintCycleAhead - val globalCounter = io.globalCounter(1, 0) - val grantBufferHint = io.grantBufferHint - - val impossible_pipe_hint = io.globalCounter >= hintCycleAhead.U - - val mshr_req_s3 = task_s3.bits.mshrTask + val mshrReq_s1 = task_s1.bits.mshrTask + val mshrReq_s3 = task_s3.bits.mshrTask + val mergeA_s1 = task_s1.bits.mergeA val need_mshr_s3 = io.s3.need_mshr - val need_write_releaseBuf_s4 = io.s4.need_write_releaseBuf - - // req_grantbuffer_next_cycle_s4: this **hit** req will request grantBuffer in S5 - val req_grantbuffer_next_cycle_s4 = !need_write_releaseBuf_s4 - - val s3_l2_hit_grant_data = task_s3.valid && !mshr_req_s3 && !need_mshr_s3 && task_s3.bits.fromA && task_s3.bits.opcode === AcquireBlock && !task_s3.bits.fromL2pft.getOrElse(false.B) - val s4_l2_hit_grant_data = task_s4.valid && req_grantbuffer_next_cycle_s4 && task_s4.bits.opcode === GrantData && task_s4.bits.fromA && !task_s4.bits.mshrTask && !task_s4.bits.fromL2pft.getOrElse(false.B) - - val hint_s1, hint_s2, hint_s3, hint_s4, hint_s5 = Wire(io.l1Hint.cloneType) - - // S1 hint - // * l1 acquire and l2 miss situation, **no hit situation** - val s1_l2_miss_refill_grant_data = task_s1.valid && task_s1.bits.fromA && task_s1.bits.opcode === GrantData - val s1_l2_miss_refill_counter_match = Wire(Bool()) - - // TODO: generalization, for now, only fit hintCycleAhead == 3 - s1_l2_miss_refill_counter_match := PopCount(Seq(d_s3, d_s4, d_s5, s3_l2_hit_grant_data, s4_l2_hit_grant_data, task_s2.valid && task_s2.bits.fromA)) === 0.U && globalCounter <= 2.U - val dummy_s1_valid = if(hintCycleAhead == 3) s1_l2_miss_refill_grant_data && s1_l2_miss_refill_counter_match else false.B - - hint_s1.valid := dummy_s1_valid && !impossible_pipe_hint - hint_s1.bits.sourceId := task_s1.bits.sourceId - // hint_s1.bits.isKeyword := task_s1.bits.isKeyword - hint_s1.bits.isKeyword := task_s1.bits.isKeyword.getOrElse(false.B) - - // S2 hint - // * l1 acquire and l2 miss situation, **no hit situation** - val s2_l2_miss_refill_grant_data = task_s2.valid && task_s2.bits.fromA && task_s2.bits.opcode === GrantData && task_s2.bits.mshrTask - val s2_l2_miss_refill_counter_match = Wire(Bool()) - - // TODO: generalization, for now, only fit hintCycleAhead == 2 - // s2_l2_miss_refill_counter_match := PopCount(Seq(d_s3, d_s4, d_s5, s3_l2_hit_grant_data, s4_l2_hit_grant_data)) === 0.U && globalCounter === 0.U - - s2_l2_miss_refill_counter_match := MuxLookup(Cat(d_s3, d_s4, d_s5), false.B)(Seq( - 
Cat(true.B, true.B, true.B) -> ((globalCounter + 4.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U), - Cat(true.B, true.B, false.B) -> ((globalCounter + 3.U + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U), - Cat(true.B, false.B, true.B) -> Mux(s4_l2_hit_grant_data, (globalCounter + 4.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U, - (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U), - Cat(false.B, true.B, true.B) -> Mux(s3_l2_hit_grant_data, (globalCounter + 4.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U, - (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U), - Cat(true.B, false.B, false.B) -> Mux(s4_l2_hit_grant_data, (globalCounter + 3.U + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U, - (globalCounter + 2.U + task_s3.bits.opcode(0)) === hintCycleAhead.U), - Cat(false.B, true.B, false.B) -> ((globalCounter + 2.U + task_s4.bits.opcode(0)) === hintCycleAhead.U), - Cat(false.B, false.B, true.B) -> Mux(s4_l2_hit_grant_data, - Mux(s3_l2_hit_grant_data, (globalCounter + 4.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U, - (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U), - (globalCounter + 2.U + task_s5.bits.opcode(0)) === hintCycleAhead.U), - Cat(false.B, false.B, false.B) -> Mux(s4_l2_hit_grant_data, - Mux(s3_l2_hit_grant_data, (globalCounter + 4.U + task_s4.bits.opcode(0) + task_s3.bits.opcode(0)) === hintCycleAhead.U, - (globalCounter + 3.U + task_s4.bits.opcode(0)) === hintCycleAhead.U), - (globalCounter + 2.U) === hintCycleAhead.U) - )) - - hint_s2.valid := s2_l2_miss_refill_grant_data && s2_l2_miss_refill_counter_match && !impossible_pipe_hint - hint_s2.bits.sourceId := task_s2.bits.sourceId - hint_s2.bits.isKeyword := task_s2.bits.isKeyword.getOrElse(false.B) - - // S3 hint - // * l1 acquire and l2 hit situation - val s3_l2_hit_counter_match = Wire(Bool()) - when(d_s5 && d_s4) { - s3_l2_hit_counter_match := (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.elsewhen(d_s4) { - s3_l2_hit_counter_match := (globalCounter + 3.U + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.elsewhen(d_s5) { - // NOTE: if s4 is a hit grantData, it will not request grantBuffer in s4, but in s5 - when(s4_l2_hit_grant_data) { - s3_l2_hit_counter_match := (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.otherwise { - s3_l2_hit_counter_match := (globalCounter + 3.U + task_s5.bits.opcode(0)) === hintCycleAhead.U - } - }.otherwise { - when(s4_l2_hit_grant_data) { - s3_l2_hit_counter_match := (globalCounter + 3.U + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.otherwise { - s3_l2_hit_counter_match := globalCounter + 3.U === hintCycleAhead.U - } - } - val validHint_s3 = s3_l2_hit_grant_data && s3_l2_hit_counter_match - - // S3 hint - // * l1 acquire and l2 miss situation - val s3_l2_miss_refill_grant_data = d_s3 && mshr_req_s3 && task_s3.bits.fromA && task_s3.bits.opcode === GrantData && !task_s3.bits.fromL2pft.getOrElse(false.B) - val s3_l2_miss_refill_counter_match = Wire(Bool()) - when(d_s5 && d_s4) { - s3_l2_miss_refill_counter_match := (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U - 
}.elsewhen(d_s4) { - s3_l2_miss_refill_counter_match := (globalCounter + 2.U + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.elsewhen(d_s5) { - when(s4_l2_hit_grant_data) { - s3_l2_miss_refill_counter_match := (globalCounter + 3.U + task_s5.bits.opcode(0) + task_s4.bits.opcode(0)) === hintCycleAhead.U - }.otherwise { - s3_l2_miss_refill_counter_match := (globalCounter + 2.U + task_s5.bits.opcode(0)) === hintCycleAhead.U - } - }.otherwise { - s3_l2_miss_refill_counter_match := globalCounter + 1.U === hintCycleAhead.U - } - val validHintMiss_s3 = s3_l2_miss_refill_grant_data && s3_l2_miss_refill_counter_match - - hint_s3.valid := (validHint_s3 || validHintMiss_s3) && !impossible_pipe_hint - hint_s3.bits.sourceId := task_s3.bits.sourceId - hint_s3.bits.isKeyword:= task_s3.bits.isKeyword.getOrElse(false.B) - - // S4 hint - // * l1 acquire and l2 hit situation - val s4_l2_hit_counter_match = Mux(d_s5 && task_s5.bits.opcode(0), (globalCounter + 3.U) === hintCycleAhead.U, - (globalCounter + 2.U) === hintCycleAhead.U ) - val validHint_s4 = s4_l2_hit_grant_data && s4_l2_hit_counter_match - // S4 hint - // * l1 acquire and l2 miss situation - val s4_l2_miss_refill_grant_data = d_s4 && task_s4.bits.opcode === GrantData && task_s4.bits.fromA && task_s4.bits.mshrTask && !task_s4.bits.fromL2pft.getOrElse(false.B) - val s4_l2_miss_refill_counter_match = Mux(d_s5 && task_s5.bits.opcode(0), (globalCounter + 3.U) === hintCycleAhead.U, - Mux(d_s5 && !task_s5.bits.opcode(0), (globalCounter + 2.U) === hintCycleAhead.U, - (globalCounter + 1.U) === hintCycleAhead.U )) - val validHintMiss_s4 = s4_l2_miss_refill_grant_data && s4_l2_miss_refill_counter_match - - hint_s4.valid := (validHint_s4 || validHintMiss_s4) && !impossible_pipe_hint - hint_s4.bits.sourceId := task_s4.bits.sourceId - hint_s4.bits.isKeyword := task_s4.bits.isKeyword.getOrElse(false.B) - // S5 hint - // * l1 acquire and l2 hit situation - val validHint_s5 = d_s5 && task_s5.bits.opcode === GrantData && task_s5.bits.fromA && !task_s5.bits.mshrTask && ((globalCounter + 1.U) === hintCycleAhead.U) && !task_s5.bits.fromL2pft.getOrElse(false.B) - // S5 hint - // * l1 acquire and l2 miss situation - val validHintMiss_s5 = d_s5 && task_s5.bits.opcode === GrantData && task_s5.bits.fromA && task_s5.bits.mshrTask && ((globalCounter + 1.U) === hintCycleAhead.U) && !task_s5.bits.fromL2pft.getOrElse(false.B) - - hint_s5.valid := (validHint_s5 || validHintMiss_s5) && !impossible_pipe_hint - hint_s5.bits.sourceId := task_s5.bits.sourceId - hint_s5.bits.isKeyword := task_s5.bits.isKeyword.getOrElse(false.B) - - val hint_valid = Seq(grantBufferHint.valid, hint_s1.valid, hint_s2.valid, hint_s3.valid, hint_s4.valid, hint_s5.valid) - val hint_sourceId = Seq(grantBufferHint.bits.sourceId, hint_s1.bits.sourceId, hint_s2.bits.sourceId, hint_s3.bits.sourceId, hint_s4.bits.sourceId, hint_s5.bits.sourceId) - val hint_isKeyword = Seq(grantBufferHint.bits.isKeyword, hint_s1.bits.isKeyword, hint_s2.bits.isKeyword, hint_s3.bits.isKeyword, hint_s4.bits.isKeyword, hint_s5.bits.isKeyword) - - io.l1Hint.valid := VecInit(hint_valid).asUInt.orR - io.l1Hint.bits.sourceId := ParallelMux(hint_valid zip hint_sourceId) - io.l1Hint.bits.isKeyword := ParallelMux(hint_valid zip hint_isKeyword) - - // TODO: open this assert when hint is really correct for all situations - // assert(PopCount(VecInit(hint_valid)) <= 1.U) - - XSPerfAccumulate(cacheParams, "hint_grantBufferHint_valid", grantBufferHint.valid) - XSPerfAccumulate(cacheParams, "hint_s1_valid", hint_s1.valid) - 
XSPerfAccumulate(cacheParams, "hint_s2_valid", hint_s2.valid) - XSPerfAccumulate(cacheParams, "hint_s3_valid", hint_s3.valid) - XSPerfAccumulate(cacheParams, "hint_s4_valid", hint_s4.valid) - XSPerfAccumulate(cacheParams, "hint_s5_valid", hint_s5.valid) - XSPerfAccumulate(cacheParams, "incorrect_hint", PopCount(VecInit(hint_valid)) > 1.U) - + def isGrantData(t: TaskBundle): Bool = t.fromA && t.opcode === GrantData + def isGrant(t: TaskBundle): Bool = t.fromA && t.opcode === Grant + def isHintAck(t: TaskBundle): Bool = t.fromA && t.opcode === HintAck // HintAck has no effect on Hint + def isRelease(t: TaskBundle): Bool = t.fromC && (t.opcode === Release || t.opcode === ReleaseData) + def isMergeGrantData(t: TaskBundle): Bool = t.fromA && t.mergeA && t.aMergeTask.opcode === GrantData + def isMergeGrant(t: TaskBundle): Bool = t.fromA && t.mergeA && t.aMergeTask.opcode === Grant + + // ==================== Hint Generation ==================== + // Hint for "MSHRTask and ReleaseAck" will fire@s1 + val mshr_GrantData_s1 = task_s1.valid && mshrReq_s1 && (isGrantData(task_s1.bits) || isMergeGrantData(task_s1.bits)) + val mshr_Grant_s1 = task_s1.valid && mshrReq_s1 && (isGrant(task_s1.bits) || isMergeGrant(task_s1.bits)) + val chn_Release_s1 = task_s1.valid && !mshrReq_s1 && isRelease(task_s1.bits) + + val enqValid_s1 = mshr_GrantData_s1 || mshr_Grant_s1 || chn_Release_s1 + val enqSource_s1 = Mux(task_s1.bits.mergeA, task_s1.bits.aMergeTask.sourceId, task_s1.bits.sourceId) + val enqKeyWord_s1 = Mux(task_s1.bits.mergeA, + task_s1.bits.aMergeTask.isKeyword.getOrElse(false.B), + task_s1.bits.isKeyword.getOrElse(false.B) + ) + val enqOpcode_s1 = ParallelPriorityMux( + Seq( + mshr_Grant_s1 -> Grant, + mshr_GrantData_s1 -> GrantData, + chn_Release_s1 -> ReleaseAck + ) + ) + + // Hint for "chnTask Hit" will fire@s3 + val chn_Grant_s3 = task_s3.valid && !mshrReq_s3 && !need_mshr_s3 && isGrant(task_s3.bits) + val chn_GrantData_s3 = task_s3.valid && !mshrReq_s3 && !need_mshr_s3 && isGrantData(task_s3.bits) + val enqValid_s3 = chn_Grant_s3 || chn_GrantData_s3 + val enqSource_s3 = task_s3.bits.sourceId + val enqKeyWord_s3 = task_s3.bits.isKeyword.getOrElse(false.B) + val enqOpcode_s3 = ParallelPriorityMux( + Seq( + chn_Grant_s3 -> Grant, + chn_GrantData_s3 -> GrantData + ) + ) + + // ==================== Hint Queue ==================== + val hintEntries = mshrsAll + val hintEntriesWidth = log2Ceil(hintEntries) + val hintQueue = Module(new Queue(new HintQueueEntry, hintEntries)) + + // this will have at most 2 entries + val hint_s1Queue = Module(new Queue(new HintQueueEntry, 4, flow = true)) + hint_s1Queue.io.enq.valid := enqValid_s1 + hint_s1Queue.io.enq.bits.opcode := enqOpcode_s1 + hint_s1Queue.io.enq.bits.source := enqSource_s1 + hint_s1Queue.io.enq.bits.isKeyword := enqKeyWord_s1 + hint_s1Queue.io.deq.ready := hintQueue.io.enq.ready && !enqValid_s3 + // WARNING:TODO: ensure queue will never overflow + assert(hint_s1Queue.io.enq.ready, "hint_s1Queue should never be full") + assert(hintQueue.io.enq.ready, "hintQueue should never be full") + + hintQueue.io.enq.valid := enqValid_s3 || hint_s1Queue.io.deq.valid + hintQueue.io.enq.bits.opcode := Mux(enqValid_s3, enqOpcode_s3, hint_s1Queue.io.deq.bits.opcode) + hintQueue.io.enq.bits.source := Mux(enqValid_s3, enqSource_s3, hint_s1Queue.io.deq.bits.source) + hintQueue.io.enq.bits.isKeyword := Mux(enqValid_s3, enqKeyWord_s3, hint_s1Queue.io.deq.bits.isKeyword) + hintQueue.io.deq.ready := io.l1Hint.ready + + io.l1Hint.valid := hintQueue.io.deq.valid && 
hintQueue.io.deq.bits.opcode === GrantData + io.l1Hint.bits.sourceId := hintQueue.io.deq.bits.source + io.l1Hint.bits.isKeyword := hintQueue.io.deq.bits.isKeyword } \ No newline at end of file diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala index a18b1845b..c93602ec9 100644 --- a/src/main/scala/coupledL2/GrantBuffer.scala +++ b/src/main/scala/coupledL2/GrantBuffer.scala @@ -74,10 +74,6 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { // to block sourceB from sending same-addr probe until GrantAck received val grantStatus = Output(Vec(grantBufInflightSize, new GrantStatus)) - - // generate hint signal for L1 - val l1Hint = ValidIO(new L2ToL1Hint()) - val globalCounter = Output(UInt((log2Ceil(mshrsAll) + 1).W)) }) // =========== functions =========== @@ -297,43 +293,6 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B // this is not used io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B) - // =========== generating Hint to L1 =========== - // TODO: the following keeps the exact same logic as before, but it needs serious optimization - val hintQueue = Module(new Queue(new L2ToL1Hint, entries = mshrsAll)) - // Total number of beats left to send in GrantBuf - // [This is better] - // val globalCounter = (grantQueue.io.count << 1.U).asUInt + grantBufValid.asUInt // entries * 2 + grantBufValid - val globalCounter = RegInit(0.U((log2Ceil(grantBufSize) + 1).W)) - when(io.d_task.fire) { - val hasData = io.d_task.bits.task.opcode(0) - when(hasData) { - globalCounter := globalCounter + 1.U // counter = counter + 2 - 1 - }.otherwise { - globalCounter := globalCounter // counter = counter + 1 - 1 - } - }.otherwise { - globalCounter := Mux(globalCounter === 0.U, 0.U, globalCounter - 1.U) // counter = counter - 1 - } - - // if globalCounter >= 3, it means the hint that should be sent is in GrantBuf - when(globalCounter >= 3.U) { - hintQueue.io.enq.valid := true.B - hintQueue.io.enq.bits.sourceId := io.d_task.bits.task.sourceId - hintQueue.io.enq.bits.isKeyword := Mux(io.d_task.bits.task.mergeA, io.d_task.bits.task.aMergeTask.isKeyword.getOrElse(false.B), io.d_task.bits.task.isKeyword.getOrElse(false.B)) - }.otherwise { - hintQueue.io.enq.valid := false.B - hintQueue.io.enq.bits.sourceId := 0.U(sourceIdBits.W) - hintQueue.io.enq.bits.isKeyword := false.B - } - hintQueue.io.deq.ready := true.B - - // tell CustomL1Hint about the delay in GrantBuf - io.globalCounter := globalCounter - - io.l1Hint.valid := hintQueue.io.deq.valid - io.l1Hint.bits.sourceId := hintQueue.io.deq.bits.sourceId - io.l1Hint.bits.isKeyword := hintQueue.io.deq.bits.isKeyword - // =========== XSPerf =========== if (cacheParams.enablePerf) { val timers = RegInit(VecInit(Seq.fill(grantBufInflightSize){0.U(64.W)})) @@ -349,6 +308,6 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { } // pftRespQueue is about to be full, and using back pressure to block All MainPipe Entrance // which can SERIOUSLY affect performance, should consider less drastic prefetch policy - XSPerfAccumulate(cacheParams, "WARNING_pftRespQueue_about_to_full", noSpaceForMSHRPft.getOrElse(false.B)) + XSPerfAccumulate(cacheParams, "pftRespQueue_about_to_full", noSpaceForMSHRPft.getOrElse(false.B)) } } diff --git a/src/main/scala/coupledL2/MainPipe.scala b/src/main/scala/coupledL2/MainPipe.scala index e83f879cd..309a41096 100644 --- a/src/main/scala/coupledL2/MainPipe.scala +++ 
b/src/main/scala/coupledL2/MainPipe.scala @@ -33,8 +33,6 @@ class MainPipe(implicit p: Parameters) extends L2Module { val io = IO(new Bundle() { /* receive task from arbiter at stage 2 */ val taskFromArb_s2 = Flipped(ValidIO(new TaskBundle())) - /* status from arbiter at stage1 */ - val taskInfo_s1 = Flipped(ValidIO(new TaskBundle())) /* handle set conflict in req arb */ val fromReqArb = Input(new Bundle() { @@ -94,9 +92,11 @@ class MainPipe(implicit p: Parameters) extends L2Module { val nestedwb = Output(new NestedWriteback) val nestedwbData = Output(new DSBlock) - val l1Hint = ValidIO(new L2ToL1Hint()) - val grantBufferHint = Flipped(ValidIO(new L2ToL1Hint())) - val globalCounter = Input(UInt((log2Ceil(mshrsAll) + 1).W)) + /* send Hint to L1 */ + val l1Hint = DecoupledIO(new L2ToL1Hint()) + /* receive s1 info for Hint */ + val taskInfo_s1 = Flipped(ValidIO(new TaskBundle())) + /* send prefetchTrain to Prefetch to trigger a prefetch req */ val prefetchTrain = prefetchOpt.map(_ => DecoupledIO(new PrefetchTrain)) @@ -491,22 +491,12 @@ class MainPipe(implicit p: Parameters) extends L2Module { val customL1Hint = Module(new CustomL1Hint) customL1Hint.io.s1 := io.taskInfo_s1 - customL1Hint.io.s2 := task_s2 - + customL1Hint.io.s3.task := task_s3 - customL1Hint.io.s3.d := d_s3.valid + // overwrite opcode: if sinkReq can respond, use sink_resp_s3.bits.opcode = Grant/GrantData + customL1Hint.io.s3.task.bits.opcode := Mux(sink_resp_s3.valid, sink_resp_s3.bits.opcode, task_s3.bits.opcode) customL1Hint.io.s3.need_mshr := need_mshr_s3 - customL1Hint.io.s4.task := task_s4 - customL1Hint.io.s4.d := d_s4.valid - customL1Hint.io.s4.need_write_releaseBuf := need_write_releaseBuf_s4 - - customL1Hint.io.s5.task := task_s5 - customL1Hint.io.s5.d := d_s5.valid - - customL1Hint.io.globalCounter := io.globalCounter - customL1Hint.io.grantBufferHint <> io.grantBufferHint - customL1Hint.io.l1Hint <> io.l1Hint io.releaseBufWrite.valid := task_s5.valid && need_write_releaseBuf_s5 @@ -620,7 +610,6 @@ class MainPipe(implicit p: Parameters) extends L2Module { val c = Seq(c_s5, c_s4, c_s3) val d = Seq(d_s5, d_s4, d_s3) - // DO NOT use TLArbiter because TLArbiter will send continuous beats for the same source val c_arb = Module(new Arbiter(io.toSourceC.bits.cloneType, c.size)) val d_arb = Module(new Arbiter(io.toSourceD.bits.cloneType, d.size)) c_arb.io.in <> c diff --git a/src/main/scala/coupledL2/RequestArb.scala b/src/main/scala/coupledL2/RequestArb.scala index 2f5b53ee6..e31860232 100644 --- a/src/main/scala/coupledL2/RequestArb.scala +++ b/src/main/scala/coupledL2/RequestArb.scala @@ -130,8 +130,10 @@ class RequestArb(implicit p: Parameters) extends L2Module { // mshr_task_s1 is s1_[reg] // task_s1 is [wire] to s2_reg val task_s1 = Mux(mshr_task_s1.valid, mshr_task_s1, chnl_task_s1) + val s1_to_s2_valid = task_s1.valid && !mshr_replRead_stall - io.taskInfo_s1 := mshr_task_s1 + io.taskInfo_s1.valid := s1_to_s2_valid + io.taskInfo_s1.bits := task_s1.bits /* Meta read request */ // ^ only sinkA/B/C tasks need to read directory @@ -157,8 +159,8 @@ class RequestArb(implicit p: Parameters) extends L2Module { /* ======== Stage 2 ======== */ val task_s2 = RegInit(0.U.asTypeOf(task_s1)) - task_s2.valid := task_s1.valid && !mshr_replRead_stall - when(task_s1.valid && !mshr_replRead_stall) { task_s2.bits := task_s1.bits } + task_s2.valid := s1_to_s2_valid + when(s1_to_s2_valid) { task_s2.bits := task_s1.bits } io.taskToPipe_s2 := task_s2 diff --git a/src/main/scala/coupledL2/Slice.scala b/src/main/scala/coupledL2/Slice.scala 
index a81a804f5..a525e1b4b 100644 --- a/src/main/scala/coupledL2/Slice.scala +++ b/src/main/scala/coupledL2/Slice.scala @@ -107,8 +107,6 @@ class Slice()(implicit p: Parameters) extends L2Module { mainPipe.io.releaseBufResp_s3.valid := RegNext(releaseBuf.io.r.valid, false.B) mainPipe.io.releaseBufResp_s3.bits := releaseBuf.io.resp.data mainPipe.io.fromReqArb.status_s1 := reqArb.io.status_s1 - mainPipe.io.grantBufferHint := grantBuf.io.l1Hint - mainPipe.io.globalCounter := grantBuf.io.globalCounter mainPipe.io.taskInfo_s1 <> reqArb.io.taskInfo_s1 // priority: nested-ReleaseData / probeAckData [NEW] > mainPipe DS rdata [OLD] @@ -129,6 +127,7 @@ class Slice()(implicit p: Parameters) extends L2Module { io.l1Hint.valid := mainPipe.io.l1Hint.valid io.l1Hint.bits.sourceId := mainPipe.io.l1Hint.bits.sourceId io.l1Hint.bits.isKeyword := mainPipe.io.l1Hint.bits.isKeyword + mainPipe.io.l1Hint.ready := io.l1Hint.ready mshrCtl.io.grantStatus := grantBuf.io.grantStatus grantBuf.io.d_task <> mainPipe.io.toSourceD diff --git a/src/main/scala/coupledL2/SourceC.scala b/src/main/scala/coupledL2/SourceC.scala index d5cfed76d..25e043231 100644 --- a/src/main/scala/coupledL2/SourceC.scala +++ b/src/main/scala/coupledL2/SourceC.scala @@ -148,7 +148,7 @@ class SourceC(implicit p: Parameters) extends L2Module { io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq // dequeued task, the only, ready to fire - // WARNING: !it will reduce Release bandwidth to half! (though it is not critical) + // WARNING: !it will reduce Release bandwidth to half!!! // TODO: change it the same way as GrantBuf val beatValids = RegInit(VecInit(Seq.fill(beatSize)(false.B))) val taskValid = beatValids.asUInt.orR From c974407d911debb213edd1087ff54bdaa6039c8a Mon Sep 17 00:00:00 2001 From: Luoshan Cai <60723329+cailuoshan@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:52:04 +0800 Subject: [PATCH 4/9] Directory: choose other free way when refill way has conflict mshr entry, instead of refillRetry (#103) Co-authored-by: Cai Luoshan --- src/main/scala/coupledL2/Directory.scala | 39 +++++++++++++++--------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/main/scala/coupledL2/Directory.scala b/src/main/scala/coupledL2/Directory.scala index f89e67b34..a440f260c 100644 --- a/src/main/scala/coupledL2/Directory.scala +++ b/src/main/scala/coupledL2/Directory.scala @@ -182,17 +182,39 @@ class Directory(implicit p: Parameters) extends L2Module { val metaValidVec = metaAll_s3.map(_.state =/= MetaData.INVALID) val hitVec = tagMatchVec.zip(metaValidVec).map(x => x._1 && x._2) + /* ====== refill retry ====== */ + // when refill, ways that have not finished writing its refillData back to DS (in MSHR Release), + // or using by Alias-Acquire (hit), can not be used for replace. 
+ // choose free way to refill, if all ways are occupied, we cancel the Grant and LET IT RETRY + // compare is done at Stage2 for better timing + val occWayMask_s2 = VecInit(io.msInfo.map(s => + Mux( + s.valid && (s.bits.set === req_s2.set) && (s.bits.blockRefill || s.bits.dirHit), + UIntToOH(s.bits.way, ways), + 0.U(ways.W) + ) + )).reduceTree(_ | _) + + val freeWayMask_s3 = RegEnable(~occWayMask_s2, refillReqValid_s2) + val refillRetry = !(freeWayMask_s3.orR) + val hitWay = OHToUInt(hitVec) val replaceWay = WireInit(UInt(wayBits.W), 0.U) val (inv, invalidWay) = invalid_way_sel(metaAll_s3, replaceWay) val chosenWay = Mux(inv, invalidWay, replaceWay) // if chosenWay not in wayMask, then choose a way in wayMask - // TODO: consider remove this is not used for better timing // for retry bug fixing: if the chosenway cause retry last time, choose another way - val finalWay = Mux( + /*val finalWay = Mux( req_s3.wayMask(chosenWay), chosenWay, PriorityEncoder(req_s3.wayMask) + )*/ + // for retry bug fixing: if the chosenway not in freewaymask, choose another way + // TODO: req_s3.wayMask not take into consideration + val finalWay = Mux( + freeWayMask_s3(chosenWay), + chosenWay, + PriorityEncoder(freeWayMask_s3) ) val hit_s3 = Cat(hitVec).orR @@ -217,19 +239,6 @@ class Directory(implicit p: Parameters) extends L2Module { io.read.ready := !io.metaWReq.valid && !io.tagWReq.valid && !replacerWen - /* ====== refill retry ====== */ - // if refill chooses a way that has not finished writing its refillData back to DS (in MSHR Release), - // or the way is using by Alias-Acquire (hit), we cancel the Grant and LET IT RETRY - - // comparing set is done at Stage2 for better timing - val wayConflictPartI = RegEnable(VecInit(io.msInfo.map(s => - s.valid && s.bits.set === req_s2.set)).asUInt, refillReqValid_s2) - - val wayConflictPartII = VecInit(io.msInfo.map(s => - (s.bits.blockRefill || s.bits.dirHit) && s.bits.way === finalWay - )).asUInt - val refillRetry = (wayConflictPartI & wayConflictPartII).orR - /* ======!! Replacement logic !!====== */ /* ====== Read, choose replaceWay ====== */ val repl_state_s3 = if(random_repl) { From cb50572f1d6781d9b0b422fa715d6e925e609712 Mon Sep 17 00:00:00 2001 From: Yangyu Chen Date: Sun, 7 Apr 2024 17:03:02 +0800 Subject: [PATCH 5/9] configs: use hartid from io (#102) * configs: use hartid from io Using hartid from parameters will result in the module not being deduplicated by chisel and firrtl. Each hart will produce its module in verilog, making it hard for the rtl simulator to optimize. 
Signed-off-by: Yangyu Chen * huancun: bump huancun Signed-off-by: Yangyu Chen --------- Signed-off-by: Yangyu Chen --- HuanCun | 2 +- src/main/scala/coupledL2/CoupledL2.scala | 2 ++ src/main/scala/coupledL2/prefetch/Prefetcher.scala | 1 + src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala | 9 +++++---- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/HuanCun b/HuanCun index fdd56a568..a69ea2cf1 160000 --- a/HuanCun +++ b/HuanCun @@ -1 +1 @@ -Subproject commit fdd56a56874b83f295bfb336a02d923342c933f6 +Subproject commit a69ea2cf169947ecf8f8df775aa7bd9744be3293 diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala index 94f99b75b..2e5700555 100644 --- a/src/main/scala/coupledL2/CoupledL2.scala +++ b/src/main/scala/coupledL2/CoupledL2.scala @@ -80,6 +80,8 @@ trait HasCoupledL2Parameters { lazy val msgSizeBits = edgeIn.bundle.sizeBits lazy val sourceIdAll = 1 << sourceIdBits + lazy val hartIdLen: Int = log2Up(cacheParams.hartIds.length) + val mshrsAll = cacheParams.mshrs val idsAll = 256// ids of L2 //TODO: Paramterize like this: max(mshrsAll * 2, sourceIdAll * 2) val mshrBits = log2Up(idsAll) diff --git a/src/main/scala/coupledL2/prefetch/Prefetcher.scala b/src/main/scala/coupledL2/prefetch/Prefetcher.scala index c962fe798..84d0b0267 100644 --- a/src/main/scala/coupledL2/prefetch/Prefetcher.scala +++ b/src/main/scala/coupledL2/prefetch/Prefetcher.scala @@ -172,6 +172,7 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule { bop.io.resp <> io.resp tp.io.train <> io.train tp.io.resp <> io.resp + tp.io.hartid := tpio.tpmeta_port.get.req.bits.hartid // send to prq pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) diff --git a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala index 19b068fa9..b0f3d181c 100644 --- a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala +++ b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala @@ -95,7 +95,7 @@ class sendBundle(implicit p: Parameters) extends TPBundle { val vaddr = UInt(vaddrBits.W) } -class tpmetaPortIO() extends Bundle { +class tpmetaPortIO(implicit p: Parameters) extends Bundle { val req = DecoupledIO(new TPmetaReq) val resp = Flipped(ValidIO(new TPmetaResp)) } @@ -107,6 +107,7 @@ class TemporalPrefetch(implicit p: Parameters) extends TPModule { val req = DecoupledIO(new PrefetchReq) val resp = Flipped(DecoupledIO(new PrefetchResp)) val tpmeta_port = new tpmetaPortIO() + val hartid = Input(UInt(hartIdLen.W)) }) def parseVaddr(x: UInt): (UInt, UInt) = { @@ -219,7 +220,7 @@ class TemporalPrefetch(implicit p: Parameters) extends TPModule { dataReadQueue.io.enq.bits.way := way_s2 dataReadQueue.io.enq.bits.wmode := false.B dataReadQueue.io.enq.bits.rawData := DontCare - dataReadQueue.io.enq.bits.hartid := hartid.U + dataReadQueue.io.enq.bits.hartid := io.hartid /* Async Stage: try to fetch or write tpData */ @@ -238,7 +239,7 @@ class TemporalPrefetch(implicit p: Parameters) extends TPModule { /* Async Stage: get tpMeta and insert it into tpDataQueue */ - tpDataQueue.io.enq.valid := io.tpmeta_port.resp.valid && io.tpmeta_port.resp.bits.hartid === hartid.U + tpDataQueue.io.enq.valid := io.tpmeta_port.resp.valid && io.tpmeta_port.resp.bits.hartid === io.hartid tpDataQueue.io.enq.bits.rawData := io.tpmeta_port.resp.bits.rawData assert(tpDataQueue.io.enq.ready === true.B) // tpDataQueue is never full @@ -294,7 +295,7 @@ class TemporalPrefetch(implicit p: 
Parameters) extends TPModule { dataWriteQueue.io.enq.bits.rawData.zip(recorder_data).foreach(x => x._1 := x._2(35-6, 0)) dataWriteQueue.io.enq.bits.set := tpTable_w_set dataWriteQueue.io.enq.bits.way := tpTable_w_way - dataWriteQueue.io.enq.bits.hartid := hartid.U + dataWriteQueue.io.enq.bits.hartid := io.hartid assert(dataWriteQueue.io.enq.ready === true.B) // TODO: support back-pressure when(resetIdx === 0.U) { From cc3034bac4deab4103b08cfc9ad0755367c63213 Mon Sep 17 00:00:00 2001 From: Yangyu Chen Date: Tue, 9 Apr 2024 17:37:50 +0800 Subject: [PATCH 6/9] configs: fix use hartid from io (#102) (#112) * configs: fix use hartid from io (#102) I have written 'tp.io.hartid := tpio.tpmeta_port.get.req.bits.hartid' before in #103. However, it is not the hartid that comes from tile but TemporalPrefetcher and caused a loop in Chisel.Queue, as it doesn't use flow, so we didn't see errors in FIRRTL but we will not get the right hartId. This commit fixes this by adding an io to the CoupledL2 module and using hartId input from the L2Top module. To get this done, we must modify L2Top.scala like this outside this repo. ```diff diff --git a/src/main/scala/xiangshan/L2Top.scala b/src/main/scala/xiangshan/L2Top.scala index b4865aba5..07d1668bb 100644 --- a/src/main/scala/xiangshan/L2Top.scala +++ b/src/main/scala/xiangshan/L2Top.scala @@ -144,6 +144,7 @@ class L2Top()(implicit p: Parameters) extends LazyModule if (l2cache.isDefined) { l2_hint := l2cache.get.module.io.l2_hint // debugTopDown <> l2cache.get.module.io.debugTopDown + l2cache.get.module.io.hartId := hartId.fromTile l2cache.get.module.io.debugTopDown.robHeadPaddr := DontCare l2cache.get.module.io.debugTopDown.robHeadPaddr.head := debugTopDown.robHeadPaddr debugTopDown.l2MissMatch := l2cache.get.module.io.debugTopDown.l2MissMatch.head ``` Finally, this commit also adds DontCare for this new signal to TestTop.scala. 
Signed-off-by: Yangyu Chen * testtop: fix unconnected signals Signed-off-by: Yangyu Chen --------- Signed-off-by: Yangyu Chen --- src/main/scala/coupledL2/CoupledL2.scala | 2 ++ .../scala/coupledL2/prefetch/Prefetcher.scala | 10 +++++-- src/test/scala/TestTop.scala | 30 ++++++++++++++----- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala index 2e5700555..c25538da5 100644 --- a/src/main/scala/coupledL2/CoupledL2.scala +++ b/src/main/scala/coupledL2/CoupledL2.scala @@ -234,6 +234,7 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para val banks = node.in.size val bankBits = if (banks == 1) 0 else log2Up(banks) val io = IO(new Bundle { + val hartId = Input(UInt(hartIdLen.W)) // val l2_hint = Valid(UInt(32.W)) val l2_hint = ValidIO(new L2ToL1Hint()) val debugTopDown = new Bundle { @@ -286,6 +287,7 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para _ => fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR + prefetcher.get.hartId := io.hartId fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) } pf_recv_node match { diff --git a/src/main/scala/coupledL2/prefetch/Prefetcher.scala b/src/main/scala/coupledL2/prefetch/Prefetcher.scala index 84d0b0267..36b07c235 100644 --- a/src/main/scala/coupledL2/prefetch/Prefetcher.scala +++ b/src/main/scala/coupledL2/prefetch/Prefetcher.scala @@ -125,8 +125,14 @@ class PrefetchQueue(implicit p: Parameters) extends PrefetchModule { class Prefetcher(implicit p: Parameters) extends PrefetchModule { val io = IO(new PrefetchIO) val tpio = IO(new Bundle() { - val tpmeta_port = prefetchOpt.map(_ => new tpmetaPortIO) + val tpmeta_port = prefetchOpt match { + case Some(param: PrefetchReceiverParams) => + if (param.hasTPPrefetcher) Some(new tpmetaPortIO()) else None + case _ => None + } }) + val hartId = IO(Input(UInt(hartIdLen.W))) + /* io_l2_pf_en: * chicken bits for whether L2 prefetchers are enabled * it will control BOP and TP prefetchers @@ -172,7 +178,7 @@ class Prefetcher(implicit p: Parameters) extends PrefetchModule { bop.io.resp <> io.resp tp.io.train <> io.train tp.io.resp <> io.resp - tp.io.hartid := tpio.tpmeta_port.get.req.bits.hartid + tp.io.hartid := hartId // send to prq pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala index 63ca3decb..1ae89baf7 100644 --- a/src/test/scala/TestTop.scala +++ b/src/test/scala/TestTop.scala @@ -77,6 +77,8 @@ class TestTop_L2()(implicit p: Parameters) extends LazyModule { case (node, i) => node.makeIOs()(ValName(s"master_port_$i")) } + + l2.module.io.hartId := DontCare } } @@ -136,7 +138,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { rrTagBits = 6 )) ) - }))).node + }))) val l3 = LazyModule(new HuanCun()(new Config((_, _, _) => { case HCCacheParamsKey => HCCacheParameters( @@ -171,7 +173,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { TLDelayer(delayFactor) :=* l3.node :=* TLBuffer() := - l2 :=* xbar + l2.node :=* xbar lazy val module = new LazyModuleImp(this) { val timer = WireDefault(0.U(64.W)) @@ -188,6 +190,8 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { case (node, i) => node.makeIOs()(ValName(s"master_port_$i")) } + + l2.module.io.hartId := DontCare } } @@ -217,7 +221,7 
@@ class TestTop_L2_Standalone()(implicit p: Parameters) extends LazyModule { ), channelBytes = TLChannelBeatBytes(cacheParams.blockBytes), minLatency = 1, - echoFields = cacheParams.echoField, + echoFields = Nil, requestFields = Seq(AliasField(2)), responseKeys = cacheParams.respKey ) @@ -280,6 +284,8 @@ class TestTop_L2_Standalone()(implicit p: Parameters) extends LazyModule { node.makeIOs()(ValName(s"master_port_$i")) } l3.makeIOs()(ValName(s"slave_port")) + + l2.module.io.hartId := DontCare } } @@ -384,7 +390,13 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { dontTouch(clean) dontTouch(dump) - coupledL2.foreach(_.module.io.debugTopDown := DontCare) + coupledL2.foreach { + case l2 => { + l2.module.io.debugTopDown := DontCare + l2.module.io.hartId := DontCare + } + } + master_nodes.zipWithIndex.foreach { case (node, i) => node.makeIOs()(ValName(s"master_port_$i")) @@ -444,7 +456,7 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { master_nodes = master_nodes ++ Seq(l1d, l1i) // TODO val l1xbar = TLXbar() - val l2node = LazyModule(new CoupledL2()(new Config((_, _, _) => { + val l2 = LazyModule(new CoupledL2()(new Config((_, _, _) => { case L2ParamKey => L2Param( name = s"l2$i", ways = 4, @@ -456,12 +468,16 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { rrTagBits = 6 )) ) - }))).node + }))) l1xbar := TLBuffer() := l1i l1xbar := TLBuffer() := l1d - l2xbar := TLBuffer() := l2node := l1xbar + l2xbar := TLBuffer() := l2.node := l1xbar + + InModuleBody { + l2.module.io.hartId := DontCare + } } val l3 = LazyModule(new HuanCun()(new Config((_, _, _) => { From 3c00e79d14253eb96a23580013f2b074c2d9df1e Mon Sep 17 00:00:00 2001 From: Yanqin Li Date: Thu, 25 Apr 2024 15:14:16 +0800 Subject: [PATCH 7/9] l2bop: train and prefetch by virtual address (#129) * misc: fix compiling issue concerning vaddr * bop: change to virtual address space training * l2 bop: catch the empty signals * l2 bop: change to pipeline form and fix bug * l2 bop: fix request vaddr transfer * l2 bop: add prefetch req filter * l2 bop: fix bug of more prefetch requests * l2 bop: close constantin control initially * l2 bop: fix bug of chiseldb * l2 bop: fix bug of chiseldb * l2bop: change filter and fill invalid entry preferentially * l2-bop: add req buffer and tlb replay count * l2bop-fix bug of wire connection * l2bop: fix bug of wire connection * l2bop: add replayEn and drop logic * Prefetch: L2 BOP pftReq-Hit does not response to Prefetcher * common: delete unused signals * coupled2: set default io signals assignment * bop: fix tlb excp signal bug * bop: avoid uncertain memory access training * bop: remove fire() * bop: set double replay * bop: add bop train record db * bop: fix alloc bug * Revert "bop: set double replay" This reverts commit ba65569a2e76cf2c92091b06573fdaf46f80e608. 
* bop: use both vbop and pbop * prefetch: add pfSource of resp * bop: fix bug of resp source label * prefetch: add flag PBOP * prefetch: add pbop record * bop: add dynamic diable * bop: badscore is controlled by constantin * notmerge-l2bop: upd topdown * notmerge-l2bop: add constantin badscore and rolling by commitInstr * bop: baseline of pvbop-02 * bop: add pbop CrossPage * bop: add offset 117,147,91 on pcp * prefetch: add delay queue * l2bop: fix compile bug in minimal config * bump: utility * bop: fix compile bug * testtop: fix test bug --------- Co-authored-by: wangkaifan Co-authored-by: XiChen --- src/main/scala/coupledL2/Common.scala | 45 ++ src/main/scala/coupledL2/CoupledL2.scala | 17 +- src/main/scala/coupledL2/GrantBuffer.scala | 6 + src/main/scala/coupledL2/MSHR.scala | 6 +- src/main/scala/coupledL2/SinkA.scala | 8 +- src/main/scala/coupledL2/Slice.scala | 3 + src/main/scala/coupledL2/TopDownMonitor.scala | 49 +- .../prefetch/BestOffsetPrefetch.scala | 571 +++++++++++++++- .../prefetch/PrefetchParameters.scala | 2 + .../coupledL2/prefetch/PrefetchReceiver.scala | 5 + .../scala/coupledL2/prefetch/Prefetcher.scala | 625 ++++++++++++------ .../coupledL2/prefetch/TemporalPrefetch.scala | 1 + .../coupledL2/utils/L2PerfCounterUtils.scala | 33 + src/test/scala/TestTop.scala | 1 + utility | 2 +- 15 files changed, 1107 insertions(+), 267 deletions(-) diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala index e87fcf9d7..48d5318cd 100644 --- a/src/main/scala/coupledL2/Common.scala +++ b/src/main/scala/coupledL2/Common.scala @@ -275,3 +275,48 @@ class L2ToL1Hint(implicit p: Parameters) extends Bundle { val sourceId = UInt(32.W) // tilelink sourceID val isKeyword = Bool() // miss entry keyword } + +// custom l2 - l1 tlb +// FIXME lyq: Tlbcmd and TlbExceptionBundle, how to use L1 corresponding bundles? 
+object TlbCmd {
+  def read = "b00".U
+  def write = "b01".U
+  def exec = "b10".U
+
+  def atom_read = "b100".U // lr
+  def atom_write = "b101".U // sc / amo
+
+  def apply() = UInt(3.W)
+  def isRead(a: UInt) = a(1,0)===read
+  def isWrite(a: UInt) = a(1,0)===write
+  def isExec(a: UInt) = a(1,0)===exec
+
+  def isAtom(a: UInt) = a(2)
+  def isAmo(a: UInt) = a===atom_write // NOTE: sc mixed
+}
+class TlbExceptionBundle extends Bundle {
+  val ld = Output(Bool())
+  val st = Output(Bool())
+  val instr = Output(Bool())
+}
+class L2TlbReq(implicit p: Parameters) extends L2Bundle{
+  val vaddr = Output(UInt((fullVAddrBits+offsetBits).W))
+  val cmd = Output(TlbCmd())
+  val size = Output(UInt(log2Ceil(log2Ceil(XLEN/8) + 1).W))
+  val kill = Output(Bool()) // used by a blocking TLB that needs to sync with other modules (e.g. icache)
+  val no_translate = Output(Bool()) // do not translate, but still do pmp/pma check
+}
+class L2TlbResp(nDups: Int = 1)(implicit p: Parameters) extends L2Bundle {
+  val paddr = Vec(nDups, Output(UInt(fullAddressBits.W)))
+  val miss = Output(Bool())
+  val excp = Vec(nDups, new Bundle {
+    val gpf = new TlbExceptionBundle()
+    val pf = new TlbExceptionBundle()
+    val af = new TlbExceptionBundle()
+  })
+}
+class L2ToL1TlbIO(nRespDups: Int = 1)(implicit p: Parameters) extends L2Bundle{
+  val req = DecoupledIO(new L2TlbReq)
+  val req_kill = Output(Bool())
+  val resp = Flipped(DecoupledIO(new L2TlbResp(nRespDups)))
+}
diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala
index c25538da5..e6cf2aeba 100644
--- a/src/main/scala/coupledL2/CoupledL2.scala
+++ b/src/main/scala/coupledL2/CoupledL2.scala
@@ -36,6 +36,7 @@ trait HasCoupledL2Parameters {
   val p: Parameters
   val cacheParams = p(L2ParamKey)

+  val XLEN = 64
   val blocks = cacheParams.sets * cacheParams.ways
   val blockBytes = cacheParams.blockBytes
   val beatBytes = cacheParams.channelBytes.d.get
@@ -48,8 +49,10 @@ trait HasCoupledL2Parameters {
   val stateBits = MetaData.stateBits
   val aliasBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.aliasBitsOpt
+  // vaddr without offset bits
   val vaddrBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.vaddrBitsOpt
+  val fullVAddrBits = vaddrBitsOpt.getOrElse(0) + offsetBits
   // from L1 load miss cache require
   val isKeywordBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.isKeywordBitsOpt
@@ -233,11 +236,17 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para
   class CoupledL2Imp(wrapper: LazyModule) extends LazyModuleImp(wrapper) {
     val banks = node.in.size
     val bankBits = if (banks == 1) 0 else log2Up(banks)
+    val l2TlbParams: Parameters = p.alterPartial {
+      case EdgeInKey => node.in.head._2
+      case EdgeOutKey => node.out.head._2
+      case BankBitsKey => bankBits
+    }
     val io = IO(new Bundle {
       val hartId = Input(UInt(hartIdLen.W))
-      // val l2_hint = Valid(UInt(32.W))
       val l2_hint = ValidIO(new L2ToL1Hint())
+      val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams)
       val debugTopDown = new Bundle {
+        val robTrueCommit = Input(UInt(64.W))
         val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W))))
         val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool()))
       }
@@ -283,12 +292,14 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para
     val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams)))))
     val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, 
DecoupledIO(new PrefetchResp()(pftParams))))) val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) + io.l2_tlb_req <> DontCare prefetchOpt.foreach { _ => fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR prefetcher.get.hartId := io.hartId fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) + prefetcher.get.io.tlb_req <> io.l2_tlb_req } pf_recv_node match { case Some(x) => @@ -402,6 +413,10 @@ class CoupledL2(implicit p: Parameters) extends LazyModule with HasCoupledL2Para prefetchResps.get(i).bits.tag := resp_tag prefetchResps.get(i).bits.set := resp_set } + s.tlb_req.req.valid := false.B + s.tlb_req.req.bits := DontCare + s.tlb_req.req_kill := DontCare + s.tlb_req.resp.ready := true.B } slice diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala index c93602ec9..97b4d66a0 100644 --- a/src/main/scala/coupledL2/GrantBuffer.scala +++ b/src/main/scala/coupledL2/GrantBuffer.scala @@ -215,6 +215,8 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { val pftRespEntry = new Bundle() { val tag = UInt(tagBits.W) val set = UInt(setBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val pfSource = UInt(MemReqSource.reqSourceBits.W) } // TODO: this may not need 10 entries, but this does not take much space val pftQueueLen = 10 @@ -224,11 +226,15 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { io.d_task.bits.task.fromL2pft.getOrElse(false.B) pftRespQueue.get.io.enq.bits.tag := io.d_task.bits.task.tag pftRespQueue.get.io.enq.bits.set := io.d_task.bits.task.set + pftRespQueue.get.io.enq.bits.vaddr.foreach(_ := io.d_task.bits.task.vaddr.getOrElse(0.U)) + pftRespQueue.get.io.enq.bits.pfSource := io.d_task.bits.task.reqSource val resp = io.prefetchResp.get resp.valid := pftRespQueue.get.io.deq.valid resp.bits.tag := pftRespQueue.get.io.deq.bits.tag resp.bits.set := pftRespQueue.get.io.deq.bits.set + resp.bits.vaddr.foreach(_ := pftRespQueue.get.io.deq.bits.vaddr.getOrElse(0.U)) + resp.bits.pfSource := pftRespQueue.get.io.deq.bits.pfSource pftRespQueue.get.io.deq.ready := resp.ready assert(pftRespQueue.get.io.enq.ready, "pftRespQueue should never be full, no back pressure logic") diff --git a/src/main/scala/coupledL2/MSHR.scala b/src/main/scala/coupledL2/MSHR.scala index fb7e344a0..46e287b80 100644 --- a/src/main/scala/coupledL2/MSHR.scala +++ b/src/main/scala/coupledL2/MSHR.scala @@ -165,7 +165,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_release.set := req.set mp_release.off := 0.U mp_release.alias.foreach(_ := 0.U) - mp_release.vaddr.foreach(_ := 0.U) + mp_release.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_release.isKeyword.foreach(_ := false.B) // if dirty, we must ReleaseData // if accessed, we ReleaseData to keep the data in L3, for future access to be faster @@ -212,7 +212,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_probeack.set := req.set mp_probeack.off := req.off mp_probeack.alias.foreach(_ := 0.U) - mp_probeack.vaddr.foreach(_ := 0.U) + mp_probeack.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_probeack.isKeyword.foreach(_ := false.B) mp_probeack.opcode := Mux( meta.dirty && isT(meta.state) || probeDirty || req.needProbeAckData, @@ -281,7 +281,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_grant.off := req.off mp_grant.sourceId := req.sourceId mp_grant.alias.foreach(_ := 0.U) - mp_grant.vaddr.foreach(_ := 0.U) + 
mp_grant.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_grant.isKeyword.foreach(_ := req.isKeyword.getOrElse(false.B)) mp_grant.opcode := odOpGen(req.opcode) mp_grant.param := Mux( diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala index fcdad4b22..da1423843 100644 --- a/src/main/scala/coupledL2/SinkA.scala +++ b/src/main/scala/coupledL2/SinkA.scala @@ -90,8 +90,8 @@ class SinkA(implicit p: Parameters) extends L2Module { task.mshrId := 0.U(mshrBits.W) task.aliasTask.foreach(_ := false.B) task.useProbeData := false.B + task.fromL2pft.foreach(_ := req.needAck) task.mshrRetry := false.B - task.fromL2pft.foreach(_ := req.isBOP) task.needHint.foreach(_ := false.B) task.dirty := false.B task.way := 0.U(wayBits.W) @@ -102,7 +102,7 @@ class SinkA(implicit p: Parameters) extends L2Module { task.wayMask := 0.U(cacheParams.ways.W) task.reqSource := req.pfSource task.replTask := false.B - task.vaddr.foreach(_ := 0.U) + task.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) task.isKeyword.foreach(_ := false.B) task.mergeA := false.B task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) @@ -134,8 +134,8 @@ class SinkA(implicit p: Parameters) extends L2Module { prefetchOpt.foreach { _ => XSPerfAccumulate(cacheParams, "sinkA_prefetch_req", io.prefetchReq.get.fire) - XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire) - XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire) + XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire) + XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire) } // cycels stalled by mainpipe diff --git a/src/main/scala/coupledL2/Slice.scala b/src/main/scala/coupledL2/Slice.scala index a525e1b4b..96f79af6c 100644 --- a/src/main/scala/coupledL2/Slice.scala +++ b/src/main/scala/coupledL2/Slice.scala @@ -141,6 +141,9 @@ class Slice()(implicit p: Parameters) extends L2Module { p.train <> mainPipe.io.prefetchTrain.get sinkA.io.prefetchReq.get <> p.req p.resp <> grantBuf.io.prefetchResp.get + p.tlb_req.req.ready := true.B + p.tlb_req.resp.valid := false.B + p.tlb_req.resp.bits := DontCare p.recv_addr := 0.U.asTypeOf(p.recv_addr) } diff --git a/src/main/scala/coupledL2/TopDownMonitor.scala b/src/main/scala/coupledL2/TopDownMonitor.scala index 4b8fdd642..975d7bfbf 100644 --- a/src/main/scala/coupledL2/TopDownMonitor.scala +++ b/src/main/scala/coupledL2/TopDownMonitor.scala @@ -30,6 +30,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { val msStatus = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRStatus)))) val latePF = Vec(banks, Input(Bool())) val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W)))) val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool())) } @@ -112,6 +113,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { val l2prefetchSent = dirResultMatchVec( r => !r.hit && (r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U || + r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stride.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stream.id.U || @@ -120,6 +122,9 @@ class TopDownMonitor()(implicit p: Parameters) 
extends L2Module { val l2prefetchSentBOP = dirResultMatchVec( r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U ) + val l2prefetchSentPBOP = dirResultMatchVec( + r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U + ) val l2prefetchSentSMS = dirResultMatchVec( r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U ) @@ -140,6 +145,10 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { r => reqFromCPU(r) && r.hit && r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.BOP.id.U ) + val l2prefetchUsefulPBOP = dirResultMatchVec( + r => reqFromCPU(r) && r.hit && + r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.PBOP.id.U + ) val l2prefetchUsefulSMS = dirResultMatchVec( r => reqFromCPU(r) && r.hit && r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.SMS.id.U @@ -166,81 +175,91 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { XSPerfRolling( cacheParams, "L2PrefetchAccuracy", PopCount(l2prefetchUseful), PopCount(l2prefetchSent), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyBOP", PopCount(l2prefetchUsefulBOP), PopCount(l2prefetchSentBOP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset + ) + XSPerfRolling( + cacheParams, "L2PrefetchAccuracyPBOP", + PopCount(l2prefetchUsefulPBOP), PopCount(l2prefetchSentPBOP), + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracySMS", PopCount(l2prefetchUsefulSMS), PopCount(l2prefetchSentSMS), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyTP", PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyStride", PopCount(l2prefetchUsefulStride), PopCount(l2prefetchSentStride), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyStream", PopCount(l2prefetchUsefulStream), PopCount(l2prefetchSentStream), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyTP", PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) // PF Late XSPerfRolling( cacheParams, "L2PrefetchLate", PopCount(l2prefetchLate), PopCount(l2prefetchUseful), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) // PF Coverage XSPerfRolling( cacheParams, "L2PrefetchCoverage", PopCount(l2prefetchUseful), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageBOP", PopCount(l2prefetchUsefulBOP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset + ) + XSPerfRolling( + cacheParams, "L2PrefetchCoveragePBOP", + PopCount(l2prefetchUsefulPBOP), PopCount(l2demandRequest), + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageSMS", PopCount(l2prefetchUsefulSMS), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, 
"L2PrefetchCoverageTP", PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageStride", PopCount(l2prefetchUsefulStride), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageStream", PopCount(l2prefetchUsefulStream), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageTP", PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfAccumulate(cacheParams, "l2prefetchSent", PopCount(l2prefetchSent)) diff --git a/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala b/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala index a8617d1bc..742bcdcd4 100644 --- a/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala +++ b/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala @@ -17,29 +17,41 @@ package coupledL2.prefetch -import utility.{MemReqSource, SRAMTemplate} +import utility.{ChiselDB, Constantin, MemReqSource, ParallelPriorityMux, RRArbiterInit, SRAMTemplate} import org.chipsalliance.cde.config.Parameters +import chisel3.DontCare.:= import chisel3._ import chisel3.util._ -import coupledL2.HasCoupledL2Parameters -import coupledL2.utils.XSPerfAccumulate +import coupledL2.{HasCoupledL2Parameters, L2TlbReq, L2ToL1TlbIO, TlbCmd} +import coupledL2.utils.{ReplacementPolicy, XSPerfAccumulate} +import scopt.Read case class BOPParameters( + virtualTrain: Boolean = true, rrTableEntries: Int = 256, rrTagBits: Int = 12, scoreBits: Int = 5, roundMax: Int = 50, - badScore: Int = 1, + badScore: Int = 2, + tlbReplayCnt: Int = 10, + dQEntries: Int = 16, + dQLatency: Int = 175, + dQMaxLatency: Int = 256, offsetList: Seq[Int] = Seq( - -32, -30, -27, -25, -24, -20, -18, -16, -15, - -12, -10, -9, -8, -6, -5, -4, -3, -2, -1, - 1, 2, 3, 4, 5, 6, 8, 9, 10, - 12, 15, 16, 18, 20, 24, 25, 27, 30//, - /*32, 36, - 40, 45, 48, 50, 54, 60, 64, 72, 75, 80, - 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, - 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, - 250, 256*/ + -256, -250, -243, -240, -225, -216, -200, + -192, -180, -162, -160, -150, -144, -135, -128, + -125, -120, -108, -100, -96, -90, -81, -80, + -75, -72, -64, -60, -54, -50, -48, -45, + -40, -36, -32, -30, -27, -25, -24, -20, + -18, -16, -15, -12, -10, -9, -8, -6, + -5, -4, -3, -2, -1, + 1, 2, 3, 4, 5, 6, 8, + 9, 10, 12, 15, 16, 18, 20, 24, + 25, 27, 30, 32, 36, 40, 45, 48, + 50, 54, 60, 64, 72, 75, 80, 81, + 90, 96, 100, 108, 120, 125, 128, 135, + 144, 150, 160, 162, 180, 192, 200, 216, + 225, 240, 243, 250/*, 256*/ )) extends PrefetchParameters { override val hasPrefetchBit: Boolean = true @@ -47,23 +59,34 @@ case class BOPParameters( override val inflightEntries: Int = 16 } -trait HasBOPParams extends HasCoupledL2Parameters { +trait HasBOPParams extends HasPrefetcherHelper { val bopParams = prefetchOpt.get.asInstanceOf[BOPParameters] + + // train address space: virtual or physical + val virtualTrain = bopParams.virtualTrain + val fullAddrBits = if(virtualTrain) fullVAddrBits else fullAddressBits + val noOffsetAddrBits = fullAddrBits - offsetBits + override val REQ_FILTER_SIZE = 16 + // Best offset val defaultMinAddrBits = offsetBits + log2Up(bopParams.rrTableEntries) + bopParams.rrTagBits - val defaultConfig = fullAddressBits >= defaultMinAddrBits 
+  val defaultConfig = fullAddrBits >= defaultMinAddrBits
   val rrTableEntries = if (defaultConfig) bopParams.rrTableEntries else 2
   val rrIdxBits = log2Up(rrTableEntries)
-  val rrTagBits = if (defaultConfig) bopParams.rrTagBits else (fullAddressBits - offsetBits - rrIdxBits)
+  val rrTagBits = if (defaultConfig) bopParams.rrTagBits else (fullAddrBits - offsetBits - rrIdxBits)
   val scoreBits = bopParams.scoreBits
   val roundMax = bopParams.roundMax
   val badScore = bopParams.badScore
+  val initScore = bopParams.badScore + 1
   val offsetList = bopParams.offsetList
   val inflightEntries = bopParams.inflightEntries
+  val dQEntries = bopParams.dQEntries
+  val dQLatency = bopParams.dQLatency
+  val dQMaxLatency = bopParams.dQMaxLatency

   val scores = offsetList.length
-  val offsetWidth = log2Up(-offsetList(0)) + 1 // -32 <= offset <= 31
+  val offsetWidth = log2Up(offsetList.max) + 2 // signed offset; the list now spans e.g. -256 <= offset <= 250
   val roundBits = log2Up(roundMax)
   val scoreMax = (1 << scoreBits) - 1
   val scoreTableIdxBits = log2Up(scores)
@@ -95,7 +118,7 @@ class ScoreTableEntry(implicit p: Parameters) extends BOPBundle {

 class TestOffsetReq(implicit p: Parameters) extends BOPBundle {
   // find whether (X-d) is in recent request table
-  val addr = UInt(fullAddressBits.W)
+  val addr = UInt(fullAddrBits.W)
   val testOffset = UInt(offsetWidth.W)
   val ptr = UInt(scoreTableIdxBits.W)
 }
@@ -113,7 +136,7 @@ class TestOffsetBundle(implicit p: Parameters) extends BOPBundle {

 class RecentRequestTable(implicit p: Parameters) extends BOPModule {
   val io = IO(new Bundle {
-    val w = Flipped(DecoupledIO(UInt(fullAddressBits.W)))
+    val w = Flipped(DecoupledIO(UInt(fullAddrBits.W)))
     val r = Flipped(new TestOffsetBundle)
   })
@@ -124,7 +147,7 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule {
   //        +-------+------------------+---------------+----------------------+
   // or:    | ... 
| 12-bit tag | 8-bit hash1 | 6-bit cache offset | // +-------+------------------+---------------+----------------------+ - def lineAddr(addr: UInt) = addr(fullAddressBits - 1, offsetBits) + def lineAddr(addr: UInt) = addr(fullAddrBits - 1, offsetBits) def hash1(addr: UInt) = lineAddr(addr)(rrIdxBits - 1, 0) def hash2(addr: UInt) = lineAddr(addr)(2 * rrIdxBits - 1, rrIdxBits) def idx(addr: UInt) = hash1(addr) ^ hash2(addr) @@ -144,7 +167,7 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule { rrTable.io.w.req.bits.data(0).valid := true.B rrTable.io.w.req.bits.data(0).tag := tag(wAddr) - val rAddr = io.r.req.bits.addr - signedExtend((io.r.req.bits.testOffset << offsetBits), fullAddressBits) + val rAddr = io.r.req.bits.addr - signedExtend((io.r.req.bits.testOffset << offsetBits), fullAddrBits) val rData = Wire(rrTableEntry()) rrTable.io.r.req.valid := io.r.req.fire rrTable.io.r.req.bits.setIdx := idx(rAddr) @@ -160,23 +183,27 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule { } -class OffsetScoreTable(implicit p: Parameters) extends BOPModule { +class OffsetScoreTable(name: String = "")(implicit p: Parameters) extends BOPModule { val io = IO(new Bundle { - val req = Flipped(DecoupledIO(UInt(fullAddressBits.W))) + val req = Flipped(DecoupledIO(UInt(fullAddrBits.W))) val prefetchOffset = Output(UInt(offsetWidth.W)) + val prefetchDisable = Output(Bool()) val test = new TestOffsetBundle }) val prefetchOffset = RegInit(2.U(offsetWidth.W)) + val prefetchDisable = RegInit(false.B) // score table // val st = RegInit(VecInit(offsetList.map(off => (new ScoreTableEntry).apply(off.U, 0.U)))) val st = RegInit(VecInit(Seq.fill(scores)((new ScoreTableEntry).apply(0.U)))) val offList = WireInit(VecInit(offsetList.map(off => off.S(offsetWidth.W).asUInt))) val ptr = RegInit(0.U(scoreTableIdxBits.W)) val round = RegInit(0.U(roundBits.W)) - + + val badscoreConstant = WireInit(Constantin.createRecord(name+"BadScore", bopParams.badScore.U)) + val initscoreConstant = WireInit(Constantin.createRecord(name+"InitScore", (bopParams.badScore+1).U)) val bestOffset = RegInit(2.U(offsetWidth.W)) // the entry with the highest score while traversing - val bestScore = RegInit(badScore.U(scoreBits.W)) + val bestScore = RegInit(10.U) val testOffset = offList(ptr) // def winner(e1: ScoreTableEntry, e2: ScoreTableEntry): ScoreTableEntry = { // val w = Wire(new ScoreTableEntry) @@ -190,12 +217,14 @@ class OffsetScoreTable(implicit p: Parameters) extends BOPModule { // 1. At the start of a learning phase // All the scores are reset to 0. // At the end of every learning phase, the prefetch offset is updated as the one with the highest score. 
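+  // Illustrative sketch (not from the original commit): during s_learn, each
+  // trained address X tests one candidate offset d against the recent-request
+  // table, and a hit bumps that offset's score, roughly:
+  //   when(io.test.resp.fire && io.test.resp.bits.hit) {
+  //     st(io.test.resp.bits.ptr).score := st(io.test.resp.bits.ptr).score + 1.U
+  //   }
+  // When the phase ends (after roundMax rounds, or once a score saturates),
+  // the best-scoring offset becomes prefetchOffset, and isBad below disables
+  // prefetching for the next phase if even the best score stayed under the
+  // Constantin-tunable bad-score threshold.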
+  val isBad = bestScore < badscoreConstant
   when(state === s_idle) {
     st.foreach(_.score := 0.U)
     ptr := 0.U
     round := 0.U
-    bestScore := badScore.U
+    bestScore := 0.U
     prefetchOffset := bestOffset
+    prefetchDisable := isBad
     state := s_learn
   }
@@ -237,12 +266,17 @@
   io.req.ready := state === s_learn
   io.prefetchOffset := prefetchOffset
+  io.prefetchDisable := prefetchDisable
   io.test.req.valid := state === s_learn && io.req.valid
   io.test.req.bits.addr := io.req.bits
   io.test.req.bits.testOffset := testOffset
   io.test.req.bits.ptr := ptr
   io.test.resp.ready := true.B

+  XSPerfAccumulate(cacheParams, "total_learn_phase", state === s_idle)
+  XSPerfAccumulate(cacheParams, "total_bop_disable", state === s_idle && isBad)
+  XSPerfAccumulate(cacheParams, "total_bop_high_confidence", state === s_idle && bestScore === scoreMax.U)
+
   for (off <- offsetList) {
     if (off < 0) {
       XSPerfAccumulate(cacheParams, "best_offset_neg_" + (-off).toString + "_learning_phases",
@@ -253,25 +287,489 @@ class OffsetScoreTable(implicit p: Parameters) extends BOPModule {
     }
   }

+  // FIXME lyq: remove the db
+  class BopTrainEntry extends Bundle {
+    val bestOffset = UInt(offsetWidth.W)
+    val bestScore = UInt(scoreBits.W)
+  }
+
+  val l2BopTrainTable = ChiselDB.createTable("L2BopTrainTable", new BopTrainEntry, basicDB = true)
+  for (i <- 0 until REQ_FILTER_SIZE) {
+    val data = Wire(new BopTrainEntry)
+    data.bestOffset := bestOffset
+    data.bestScore := bestScore
+    // l2BopTrainTable.log(data = data, en = (state === s_idle) && !isBad, site = name+"OffsetScoreTable", clock, reset)
+    l2BopTrainTable.log(data = data, en = (state === s_idle) && !isBad, site = name+"OffsetScoreTable", clock, reset)
+  }
+
+}
+
+class BopReqBundle(implicit p: Parameters) extends BOPBundle{
+  val full_vaddr = UInt(fullVAddrBits.W)
+  val base_vaddr = UInt(vaddrBitsOpt.getOrElse(0).W)
+  val needT = Bool()
+  val source = UInt(sourceIdBits.W)
+  val isBOP = Bool()
+}
+
+class BopReqBufferEntry(implicit p: Parameters) extends BOPBundle {
+  val valid = Bool()
+  // for tlb req
+  val paddrValid = Bool()
+  val vaddrNoOffset = UInt((fullVAddrBits-offsetBits).W)
+  val baseVaddr = UInt((fullVAddrBits-offsetBits).W)
+  val paddrNoOffset = UInt(fullVAddrBits.W)
+  val replayEn = Bool()
+  val replayCnt = UInt(4.W)
+  // for pf req
+  val needT = Bool()
+  val source = UInt(sourceIdBits.W)
+
+  def reset(x: UInt): Unit = {
+    valid := false.B
+    paddrValid := false.B
+    vaddrNoOffset := 0.U
+    baseVaddr := 0.U
+    paddrNoOffset := 0.U
+    replayEn := false.B
+    replayCnt := 0.U
+    needT := false.B
+    source := 0.U
+  }
+
+  def fromBopReqBundle(req: BopReqBundle) = {
+    valid := true.B
+    paddrValid := false.B
+    vaddrNoOffset := get_block_vaddr(req.full_vaddr)
+    baseVaddr := req.base_vaddr
+    replayEn := false.B
+    replayCnt := 0.U
+    paddrNoOffset := 0.U
+    needT := req.needT
+    source := req.source
+  }
+
+  def isEqualBopReq(req: BopReqBundle) = {
+    // FIXME lyq: the comparison logic is complicated; is there a way to simplify it?
+    valid &&
+      vaddrNoOffset === get_block_vaddr(req.full_vaddr) &&
+      baseVaddr === req.base_vaddr &&
+      needT === req.needT &&
+      source === req.source
+  }
+
+  def toPrefetchReq(): PrefetchReq = {
+    val req = Wire(new PrefetchReq)
+    req.tag := parseFullAddress(get_pf_paddr())._1
+    req.set := parseFullAddress(get_pf_paddr())._2
+    req.vaddr.foreach(_ := baseVaddr)
+    req.needT := needT
+    req.source := source
+    req.pfSource := MemReqSource.Prefetch2L2BOP.id.U
+    req
+  }
+
+  def can_send_pf(): Bool = {
+    valid && paddrValid
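+    // i.e. the entry is allocated and the TLB has already returned a valid
+    // physical address for it, so a prefetch request can be issued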
+  }
+
+  def get_pf_paddr(): UInt = {
+    Cat(paddrNoOffset, 0.U(offsetBits.W))
+  }
+
+  def get_tlb_vaddr(): UInt = {
+    Cat(vaddrNoOffset, 0.U(offsetBits.W))
+  }
+
+  def update_paddr(paddr: UInt) = {
+    paddrValid := true.B
+    paddrNoOffset := paddr(paddr.getWidth-1, offsetBits)
+    replayEn := false.B
+    replayCnt := 0.U
+  }
+
+  def update_sent(): Unit ={
+    valid := false.B
+  }
+
+  def update_excp(): Unit = {
+    valid := false.B
+  }
+}
+
+class PrefetchReqBuffer(implicit p: Parameters) extends BOPModule{
+  val io = IO(new Bundle() {
+    val in_req = Flipped(ValidIO(new BopReqBundle))
+    val tlb_req = new L2ToL1TlbIO(nRespDups = 1)
+    val out_req = DecoupledIO(new PrefetchReq)
+  })
+
+  val firstTlbReplayCnt = WireInit(Constantin.createRecord("firstTlbReplayCnt", bopParams.tlbReplayCnt.U))
+
+  def wayMap[T <: Data](f: Int => T) = VecInit((0 until REQ_FILTER_SIZE).map(f))
+  def get_flag(vaddr: UInt) = get_block_vaddr(vaddr)
+
+  // if full then drop new req, so there is no need to use s1_evicted_oh & replacement
+  val entries = Seq.fill(REQ_FILTER_SIZE)(Reg(new BopReqBufferEntry))
+  //val replacement = ReplacementPolicy.fromString("plru", REQ_FILTER_SIZE)
+  val tlb_req_arb = Module(new RRArbiterInit(new L2TlbReq, REQ_FILTER_SIZE))
+  val pf_req_arb = Module(new RRArbiterInit(new PrefetchReq, REQ_FILTER_SIZE))
+
+  io.tlb_req.req <> tlb_req_arb.io.out
+  io.tlb_req.req_kill := false.B
+  io.tlb_req.resp.ready := true.B
+  io.out_req <> pf_req_arb.io.out
+
+  /* s0: entries look up */
+  val prev_in_valid = RegNext(io.in_req.valid, false.B)
+  val prev_in_req = RegEnable(io.in_req.bits, io.in_req.valid)
+  val prev_in_flag = get_flag(prev_in_req.full_vaddr)
+  // s1 entry update
+  val alloc = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+
+  val s0_in_req = io.in_req.bits
+  val s0_in_flag = get_flag(s0_in_req.full_vaddr)
+  val s0_conflict_prev = prev_in_valid && s0_in_flag === prev_in_flag
+  // FIXME lyq: the comparison logic is complicated; is there a way to simplify it?
+  val s0_match_oh = VecInit(entries.indices.map(i =>
+    entries(i).valid && entries(i).vaddrNoOffset === s0_in_flag &&
+    entries(i).needT === s0_in_req.needT && entries(i).source === s0_in_req.source &&
+    entries(i).baseVaddr === s0_in_req.base_vaddr
+  )).asUInt
+  val s0_match = Cat(s0_match_oh).orR
+
+  val s0_invalid_vec = wayMap(w => !entries(w).valid && !alloc(w))
+  val s0_has_invalid_way = s0_invalid_vec.asUInt.orR
+  val s0_invalid_oh = ParallelPriorityMux(s0_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(REQ_FILTER_SIZE.W))))
+
+  val s0_req_valid = io.in_req.valid && !s0_conflict_prev && !s0_match && s0_has_invalid_way
+  val s0_tlb_fire_oh = VecInit(tlb_req_arb.io.in.map(_.fire)).asUInt
+  val s0_pf_fire_oh = VecInit(pf_req_arb.io.in.map(_.fire)).asUInt
+  //val s0_access_way = Mux(s0_match, OHToUInt(s0_match_oh), OHToUInt(s0_replace_oh))
+  //when(s0_req_valid){
+  //  replacement.access(s0_access_way)
+  //}
+  XSPerfAccumulate(cacheParams, "recv_req", io.in_req.valid)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_conflict", io.in_req.valid && s0_conflict_prev)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_match", io.in_req.valid && !s0_conflict_prev && s0_match)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_full", io.in_req.valid && !s0_conflict_prev && !s0_match && !s0_has_invalid_way)
+
+
+  /* s1 update and replace */
+  val s1_valid = RegNext(s0_req_valid, false.B)
+  val s1_in_req = RegEnable(s0_in_req, s0_req_valid)
+  val s1_invalid_oh = RegEnable(s0_invalid_oh, 0.U, s0_req_valid)
+  val s1_pf_fire_oh = RegNext(s0_pf_fire_oh, 0.U)
+  val s1_tlb_fire_oh = RegNext(s0_tlb_fire_oh, 0.U)
+  val s1_alloc_entry = Wire(new BopReqBufferEntry)
+  s1_alloc_entry.fromBopReqBundle(s1_in_req)
+
+  /* entry update */
+  val exp_drop = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val miss_drop = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val miss_first_replay = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val pf_fired = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val tlb_fired = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  for ((e, i) <- entries.zipWithIndex){
+    alloc(i) := s1_valid && s1_invalid_oh(i)
+    pf_fired(i) := s0_pf_fire_oh(i)
+    exp_drop(i) := s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss &&
+      ((e.needT && (io.tlb_req.resp.bits.excp.head.pf.st || io.tlb_req.resp.bits.excp.head.af.st)) ||
+        (!e.needT && (io.tlb_req.resp.bits.excp.head.pf.ld || io.tlb_req.resp.bits.excp.head.af.ld)))
+    val miss = s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && io.tlb_req.resp.bits.miss
+    tlb_fired(i) := s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss && !exp_drop(i)
+    miss_drop(i) := miss && e.replayEn
+    miss_first_replay(i) := miss && !e.replayEn
+
+    // old data: update replayCnt
+    when(e.valid && e.replayCnt.orR) {
+      e.replayCnt := e.replayCnt - 1.U
+    }
+    // recent data: update tlb resp
+    when(tlb_fired(i)){
+      e.update_paddr(io.tlb_req.resp.bits.paddr.head)
+    }.elsewhen(miss_drop(i)) { // miss
+      e.reset(i.U)
+    }.elsewhen(miss_first_replay(i)){
+      e.replayCnt := firstTlbReplayCnt
+      e.replayEn := 1.U
+    }.elsewhen(exp_drop(i)){
+      e.update_excp()
+    }
+    // issue data: update pf
+    when(pf_fired(i)){
+      e.update_sent()
+    }
+    // new data: update data
+    when(alloc(i)){
+      e := s1_alloc_entry
+    }
+  }
+
+  /* tlb & pf */
+  for((e, i) <- entries.zipWithIndex){
+    //tlb_req_arb.io.in(i).valid := e.valid && !s1_tlb_fire_oh(i) && !s2_tlb_fire_oh(i) && !e.paddrValid && !s1_evicted_oh(i)
+    tlb_req_arb.io.in(i).valid := e.valid && !e.paddrValid && !s1_tlb_fire_oh(i) && !e.replayCnt.orR
+    tlb_req_arb.io.in(i).bits.vaddr := e.get_tlb_vaddr()
+    when(e.needT) {
+      tlb_req_arb.io.in(i).bits.cmd := TlbCmd.write
+    }.otherwise{
+      tlb_req_arb.io.in(i).bits.cmd := TlbCmd.read
+    }
+    tlb_req_arb.io.in(i).bits.size := 3.U
+    tlb_req_arb.io.in(i).bits.kill := false.B
+    tlb_req_arb.io.in(i).bits.no_translate := false.B
+
+    pf_req_arb.io.in(i).valid := e.can_send_pf()
+    pf_req_arb.io.in(i).bits := e.toPrefetchReq()
+  }
+
+  // reset meta to avoid multi-hit problem
+  for (i <- 0 until REQ_FILTER_SIZE) {
+    when(reset.asBool) {
+      entries(i).reset(i.U)
+    }
+  }
+
+  XSPerfAccumulate(cacheParams, "tlb_req", io.tlb_req.req.valid)
+  XSPerfAccumulate(cacheParams, "tlb_miss", io.tlb_req.resp.valid && io.tlb_req.resp.bits.miss)
+  XSPerfAccumulate(cacheParams, "tlb_excp",
+    io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss && (
+      io.tlb_req.resp.bits.excp.head.pf.st || io.tlb_req.resp.bits.excp.head.af.st ||
+      io.tlb_req.resp.bits.excp.head.pf.ld || io.tlb_req.resp.bits.excp.head.af.ld
+    ))
+  XSPerfAccumulate(cacheParams, "entry_alloc", PopCount(alloc))
+  XSPerfAccumulate(cacheParams, "entry_miss_first_replay", PopCount(miss_first_replay))
+  XSPerfAccumulate(cacheParams, "entry_miss_drop", PopCount(miss_drop))
+  XSPerfAccumulate(cacheParams, "entry_excp", PopCount(exp_drop))
+  XSPerfAccumulate(cacheParams, "entry_merge", io.in_req.valid && s0_match)
+  XSPerfAccumulate(cacheParams, "entry_pf_fire", PopCount(pf_fired))
+
+  /*
+  val enTable = WireInit(Constantin.createRecord("isWriteL2BopTable", 1.U))
+  val l2BOPTable = ChiselDB.createTable("L2BOPTable", new BopReqBufferEntry, basicDB = true)
+  for (i <- 0 until REQ_FILTER_SIZE){
+    when(alloc(i)){
+      l2BOPTable.log(
+        data = entries(i),
+        en = enTable.orR && pf_fired(i),
+        site = "L2BOPTable",
+        clock = clock,
+        reset = reset
+      )
+    }
+  }
+  */
+}
+
+class DelayQueue(name: String = "")(implicit p: Parameters) extends BOPModule{
+  val io = IO(new Bundle(){
+    val in = Flipped(DecoupledIO(UInt(noOffsetAddrBits.W)))
+    val out = DecoupledIO(UInt(fullAddrBits.W))
+    // only record `fullAddrBits - offsetBits` bits
+    // out.bits = Cat(record, 0.U(offsetBits))
+  })
+
+  /* Setting */
+  val IdxWidth = log2Up(dQEntries)
+  val LatencyWidth = log2Up(dQMaxLatency)
+  class Entry extends Bundle{
+    val addrNoOffset = UInt(noOffsetAddrBits.W)
+    val cnt = UInt(LatencyWidth.W)
+  }
+  val queue = RegInit(VecInit(Seq.fill(dQEntries)(0.U.asTypeOf(new Entry))))
+  val valids = RegInit(VecInit(Seq.fill(dQEntries)(false.B)))
+  val head = RegInit(0.U(IdxWidth.W))
+  val tail = RegInit(0.U(IdxWidth.W))
+  val empty = head === tail && !valids.last
+  val full = head === tail && valids.last
+  val outValid = !empty && !queue(head).cnt.orR && valids(head)
+
+  /* In & Out */
+  val setDqLatency = WireInit(Constantin.createRecord("DelayQueueLatency"+name, dQLatency.U))
+  when(io.in.valid && !full) {
+    // if queue is full, we drop the new request
+    queue(tail).addrNoOffset := io.in.bits
+    queue(tail).cnt := setDqLatency // dQLatency.U
+    valids(tail) := true.B
+    tail := tail + 1.U
+
+    /*
+    // if full, drop the old request
+    when(full && !io.deq.ready) {
+      head := head + 1.U
+    }
+    */
+  }
+  when(outValid && io.out.ready) {
+    valids(head) := false.B
+    head := head + 1.U
+  }
+  io.in.ready := true.B
+  io.out.valid := outValid
+  io.out.bits := Cat(queue(head).addrNoOffset, 0.U(offsetBits.W))
+
+  /* Update */
+  for(i <- 0 until dQEntries){
+    when(queue(i).cnt.orR){
+      queue(i).cnt := queue(i).cnt - 1.U
+    }
+  }
+
+  /* Perf */
+  XSPerfAccumulate(cacheParams, "full", full)
+  XSPerfAccumulate(cacheParams, "empty", empty)
+  XSPerfAccumulate(cacheParams, "entryNumber", PopCount(valids.asUInt))
+  XSPerfAccumulate(cacheParams, "inNumber", io.in.valid)
+  XSPerfAccumulate(cacheParams, "outNumber", io.out.valid)
+
+}
+
+class VBestOffsetPrefetch(implicit p: Parameters) extends BOPModule {
+  val io = IO(new Bundle() {
+    val train = Flipped(DecoupledIO(new PrefetchTrain))
+    val pbopCrossPage = Input(Bool())
+    val tlb_req = new L2ToL1TlbIO(nRespDups= 1)
+    val req = DecoupledIO(new PrefetchReq)
+    val resp = Flipped(DecoupledIO(new PrefetchResp))
+  })
+
+  val delayQueue = Module(new DelayQueue("vbop"))
+  val rrTable = Module(new RecentRequestTable)
+  val scoreTable = Module(new OffsetScoreTable("vbop"))
+
+  val s0_fire = scoreTable.io.req.fire && io.pbopCrossPage
+  val s1_fire = WireInit(false.B)
+  val s0_ready, s1_ready = WireInit(false.B)
+
+  /* s0 train */
+  val prefetchOffset = scoreTable.io.prefetchOffset
+  val prefetchDisable = scoreTable.io.prefetchDisable
+  // NOTE: vaddr from l1 to l2 has no offset bits
+  val s0_reqVaddr = io.train.bits.vaddr.getOrElse(0.U)
+  val s0_oldFullAddr = if(virtualTrain) Cat(io.train.bits.vaddr.getOrElse(0.U), 0.U(offsetBits.W)) else io.train.bits.addr
+  val s0_oldFullAddrNoOff = s0_oldFullAddr(s0_oldFullAddr.getWidth-1, offsetBits)
+  val s0_newFullAddr = s0_oldFullAddr + signedExtend((prefetchOffset << offsetBits), fullAddrBits)
+  val s0_crossPage = getPPN(s0_newFullAddr) =/= getPPN(s0_oldFullAddr) // unequal tags
+  val respFullAddr = if(virtualTrain) Cat(io.resp.bits.vaddr.getOrElse(0.U), 0.U(offsetBits.W))
+                     else io.resp.bits.addr - signedExtend((prefetchOffset << offsetBits), fullAddrBits)
+
+  rrTable.io.r <> scoreTable.io.test
+  rrTable.io.w <> delayQueue.io.out
+  delayQueue.io.in.valid := io.train.valid
+  delayQueue.io.in.bits := s0_oldFullAddrNoOff
+  scoreTable.io.req.valid := io.train.valid
+  scoreTable.io.req.bits := s0_oldFullAddr
+
+  /* s1 get or send req */
+  val s1_req_valid = RegInit(false.B)
+  val s1_needT = RegEnable(io.train.bits.needT, s0_fire)
+  val s1_source = RegEnable(io.train.bits.source, s0_fire)
+  val s1_newFullAddr = RegEnable(s0_newFullAddr, s0_fire)
+  val s1_reqVaddr = RegEnable(s0_reqVaddr, s0_fire)
+  // val out_req = Wire(new PrefetchReq)
+  // val out_req_valid = Wire(Bool())
+  // val out_drop_req = WireInit(false.B)
+
+  // pipeline control signal
+  when(s0_fire) {
+    if(virtualTrain) s1_req_valid := true.B
+    else s1_req_valid := !s0_crossPage // stop prefetch when prefetch req crosses pages
+  }.elsewhen(s1_fire){
+    s1_req_valid := false.B
+  }
+
+  if (virtualTrain) {
+    // FIXME lyq: it is not correct
+    s0_ready := io.tlb_req.req.ready && s1_ready || !s1_req_valid
+    s1_ready := io.req.ready || !io.req.valid
+    s1_fire := s1_ready && s1_req_valid
+  } else {
+    s0_ready := io.req.ready || !io.req.valid
+    s1_ready := io.req.ready
+    s1_fire := io.req.fire
+  }
+
+  // out value
+  io.train.ready := delayQueue.io.in.ready && scoreTable.io.req.ready && s0_ready
+  io.resp.ready := rrTable.io.w.ready
+  io.tlb_req.resp.ready := true.B
+
+  // different situation
+  val reqFilter = Module(new PrefetchReqBuffer)
+  when(prefetchDisable || !virtualTrain.B){
+    reqFilter.io.in_req.valid := false.B
+    reqFilter.io.in_req.bits := DontCare
+  }.otherwise{
+    reqFilter.io.in_req.valid := s1_req_valid
+    reqFilter.io.in_req.bits.full_vaddr := s1_newFullAddr
+    reqFilter.io.in_req.bits.base_vaddr := s1_reqVaddr
+    reqFilter.io.in_req.bits.needT := s1_needT
+    reqFilter.io.in_req.bits.source := s1_source
+    reqFilter.io.in_req.bits.isBOP := true.B
+  }
+
+  if(virtualTrain){
+    io.tlb_req <> reqFilter.io.tlb_req
+    io.req <> reqFilter.io.out_req
+  } else {
+    io.tlb_req.req.valid := false.B
+    io.tlb_req.req.bits := DontCare
+    io.tlb_req.req_kill := false.B
+
+    /* s1 send prefetch req */
+    io.req.valid := s1_req_valid
+    io.req.bits.tag := parseFullAddress(s1_newFullAddr)._1
+    io.req.bits.set := parseFullAddress(s1_newFullAddr)._2
+    io.req.bits.vaddr.foreach(_ := 0.U)
+    io.req.bits.needT := s1_needT
+    io.req.bits.source := s1_source
+    io.req.bits.pfSource := MemReqSource.Prefetch2L2BOP.id.U
+    io.req.bits.isBOP := true.B
+  }
+
+  for (off <- offsetList) {
+    if (off < 0) {
+      XSPerfAccumulate(cacheParams, "best_offset_neg_" + (-off).toString, prefetchOffset === off.S(offsetWidth.W).asUInt)
+    } else {
+      XSPerfAccumulate(cacheParams, "best_offset_pos_" + off.toString, prefetchOffset === off.U)
+    }
+  }
+  XSPerfAccumulate(cacheParams, "bop_req", io.req.fire)
+  XSPerfAccumulate(cacheParams, "bop_train", io.train.fire)
+  XSPerfAccumulate(cacheParams, "bop_resp", io.resp.fire)
+  XSPerfAccumulate(cacheParams, "bop_train_stall_for_st_not_ready", io.train.valid && !scoreTable.io.req.ready)
+  if(virtualTrain){
+    XSPerfAccumulate(cacheParams, "bop_train_stall_for_tlb_not_ready", io.train.valid && !io.tlb_req.req.ready)
+    // XSPerfAccumulate(cacheParams, "bop_req_drop", out_drop_req)
+  }else{
+    XSPerfAccumulate(cacheParams, "bop_cross_page", scoreTable.io.req.fire && s0_crossPage)
+  }
+  XSPerfAccumulate(cacheParams, "bop_drop_for_disable", scoreTable.io.req.fire && prefetchDisable)
+}
+
-class 
BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { +class PBestOffsetPrefetch(implicit p: Parameters) extends BOPModule { val io = IO(new Bundle() { val train = Flipped(DecoupledIO(new PrefetchTrain)) + val pbopCrossPage = Output(Bool()) val req = DecoupledIO(new PrefetchReq) val resp = Flipped(DecoupledIO(new PrefetchResp)) }) + val delayQueue = Module(new DelayQueue("pbop")) val rrTable = Module(new RecentRequestTable) - val scoreTable = Module(new OffsetScoreTable) + val scoreTable = Module(new OffsetScoreTable("pbop")) val prefetchOffset = scoreTable.io.prefetchOffset + val prefetchDisable = scoreTable.io.prefetchDisable val oldAddr = io.train.bits.addr + val oldAddrNoOff = oldAddr(oldAddr.getWidth-1, offsetBits) val newAddr = oldAddr + signedExtend((prefetchOffset << offsetBits), fullAddressBits) rrTable.io.r <> scoreTable.io.test - rrTable.io.w.valid := io.resp.valid - rrTable.io.w.bits := Cat(Cat(io.resp.bits.tag, io.resp.bits.set) - signedExtend(prefetchOffset, setBits + fullTagBits), 0.U(offsetBits.W)) + rrTable.io.w <> delayQueue.io.out + delayQueue.io.in.valid := io.train.valid + delayQueue.io.in.bits := oldAddrNoOff scoreTable.io.req.valid := io.train.valid scoreTable.io.req.bits := oldAddr @@ -286,13 +784,14 @@ class BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { req.set := parseFullAddress(newAddr)._2 req.needT := io.train.bits.needT req.source := io.train.bits.source - req_valid := !crossPage // stop prefetch when prefetch req crosses pages + req_valid := !crossPage && !prefetchDisable // stop prefetch when prefetch req crosses pages } + io.pbopCrossPage := crossPage io.req.valid := req_valid io.req.bits := req - io.req.bits.pfSource := MemReqSource.Prefetch2L2BOP.id.U - io.train.ready := scoreTable.io.req.ready && (!req_valid || io.req.ready) + io.req.bits.pfSource := MemReqSource.Prefetch2L2PBOP.id.U + io.train.ready := delayQueue.io.in.ready && scoreTable.io.req.ready && (!req_valid || io.req.ready) io.resp.ready := rrTable.io.w.ready for (off <- offsetList) { @@ -304,6 +803,8 @@ class BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { } XSPerfAccumulate(cacheParams, "bop_req", io.req.fire) XSPerfAccumulate(cacheParams, "bop_train", io.train.fire) + XSPerfAccumulate(cacheParams, "bop_resp", io.resp.fire) XSPerfAccumulate(cacheParams, "bop_train_stall_for_st_not_ready", io.train.valid && !scoreTable.io.req.ready) - XSPerfAccumulate(cacheParams, "bop_cross_page", scoreTable.io.req.fire && crossPage) -} + XSPerfAccumulate(cacheParams, "bop_drop_for_cross_page", scoreTable.io.req.fire && crossPage) + XSPerfAccumulate(cacheParams, "bop_drop_for_disable", scoreTable.io.req.fire && prefetchDisable) +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala b/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala index 6490cb2ef..b1610ca20 100644 --- a/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala +++ b/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala @@ -40,6 +40,7 @@ object PfSource extends Enumeration { val NoWhere = Value("NoWhere") val SMS = Value("SMS") val BOP = Value("BOP") + val PBOP = Value("PBOP") val Stream = Value("Stream") val Stride = Value("Stride") val TP = Value("TP") @@ -51,6 +52,7 @@ object PfSource extends Enumeration { val pfsrc = WireInit(NoWhere.id.U.asTypeOf(UInt(pfSourceBits.W))) switch(s) { is (MemReqSource.Prefetch2L2BOP.id.U) { pfsrc := BOP.id.U } + is (MemReqSource.Prefetch2L2PBOP.id.U) { pfsrc := PBOP.id.U } is 
(MemReqSource.Prefetch2L2SMS.id.U) { pfsrc := SMS.id.U } is (MemReqSource.Prefetch2L2TP.id.U) { pfsrc := TP.id.U } is (MemReqSource.Prefetch2L2Stream.id.U) { pfsrc := Stream.id.U } diff --git a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala index b149c3aae..92cde9d0e 100644 --- a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala +++ b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala @@ -42,9 +42,14 @@ class PrefetchReceiver()(implicit p: Parameters) extends PrefetchModule { io.req.bits.tag := parseFullAddress(io.recv_addr.bits.addr)._1 io.req.bits.set := parseFullAddress(io.recv_addr.bits.addr)._2 + io.req.bits.vaddr.foreach(_ := 0.U) io.req.bits.needT := false.B io.req.bits.source := 0.U // TODO: ensure source 0 is dcache io.req.bits.pfSource := io.recv_addr.bits.pfSource io.req.valid := io.recv_addr.valid + io.tlb_req.req.valid := false.B + io.tlb_req.req.bits := DontCare + io.tlb_req.req_kill := DontCare + io.tlb_req.resp.ready := true.B } diff --git a/src/main/scala/coupledL2/prefetch/Prefetcher.scala b/src/main/scala/coupledL2/prefetch/Prefetcher.scala index 36b07c235..cf08fc616 100644 --- a/src/main/scala/coupledL2/prefetch/Prefetcher.scala +++ b/src/main/scala/coupledL2/prefetch/Prefetcher.scala @@ -1,208 +1,417 @@ -/** ************************************************************************************* - * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences - * Copyright (c) 2020-2021 Peng Cheng Laboratory - * - * XiangShan is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * - * See the Mulan PSL v2 for more details. 
- * ************************************************************************************* - */ - -package coupledL2.prefetch - -import chisel3._ -import chisel3.util._ -import utility._ -import org.chipsalliance.cde.config.Parameters -import freechips.rocketchip.tilelink._ -import coupledL2._ -import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram} - -class PrefetchReq(implicit p: Parameters) extends PrefetchBundle { - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - val needT = Bool() - val source = UInt(sourceIdBits.W) - val pfSource = UInt(MemReqSource.reqSourceBits.W) - - def isBOP:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U - def isSMS:Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U - def isTP:Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U - def fromL2:Bool = - pfSource === MemReqSource.Prefetch2L2BOP.id.U || - pfSource === MemReqSource.Prefetch2L2SMS.id.U || - pfSource === MemReqSource.Prefetch2L2TP.id.U -} - -class PrefetchResp(implicit p: Parameters) extends PrefetchBundle { - // val id = UInt(sourceIdBits.W) - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - def addr = Cat(tag, set, 0.U(offsetBits.W)) -} - -class PrefetchTrain(implicit p: Parameters) extends PrefetchBundle { - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - val needT = Bool() - val source = UInt(sourceIdBits.W) - val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) - val hit = Bool() - val prefetched = Bool() - val pfsource = UInt(PfSource.pfSourceBits.W) - val reqsource = UInt(MemReqSource.reqSourceBits.W) - - def addr: UInt = Cat(tag, set, 0.U(offsetBits.W)) -} - -class PrefetchIO(implicit p: Parameters) extends PrefetchBundle { - val train = Flipped(DecoupledIO(new PrefetchTrain)) - val req = DecoupledIO(new PrefetchReq) - val resp = Flipped(DecoupledIO(new PrefetchResp)) - val recv_addr = Flipped(ValidIO(new Bundle() { - val addr = UInt(64.W) - val pfSource = UInt(MemReqSource.reqSourceBits.W) - })) -} - -class PrefetchQueue(implicit p: Parameters) extends PrefetchModule { - val io = IO(new Bundle { - val enq = Flipped(DecoupledIO(new PrefetchReq)) - val deq = DecoupledIO(new PrefetchReq) - }) - /* Here we implement a queue that - * 1. is pipelined 2. flows - * 3. 
always has the latest reqs, which means the queue is always ready for enq and deserting the eldest ones - */ - val queue = RegInit(VecInit(Seq.fill(inflightEntries)(0.U.asTypeOf(new PrefetchReq)))) - val valids = RegInit(VecInit(Seq.fill(inflightEntries)(false.B))) - val idxWidth = log2Up(inflightEntries) - val head = RegInit(0.U(idxWidth.W)) - val tail = RegInit(0.U(idxWidth.W)) - val empty = head === tail && !valids.last - val full = head === tail && valids.last - - when(!empty && io.deq.ready) { - valids(head) := false.B - head := head + 1.U - } - - when(io.enq.valid) { - queue(tail) := io.enq.bits - valids(tail) := !empty || !io.deq.ready // true.B - tail := tail + (!empty || !io.deq.ready).asUInt - when(full && !io.deq.ready) { - head := head + 1.U - } - } - - io.enq.ready := true.B - io.deq.valid := !empty || io.enq.valid - io.deq.bits := Mux(empty, io.enq.bits, queue(head)) - - // The reqs that are discarded = enq - deq - XSPerfAccumulate(cacheParams, "prefetch_queue_enq", io.enq.fire) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromBOP", io.enq.fire && io.enq.bits.isBOP) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromSMS", io.enq.fire && io.enq.bits.isSMS) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromTP", io.enq.fire && io.enq.bits.isTP) - - XSPerfAccumulate(cacheParams, "prefetch_queue_deq", io.deq.fire) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromBOP", io.deq.fire && io.deq.bits.isBOP) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromSMS", io.deq.fire && io.deq.bits.isSMS) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromTP", io.deq.fire && io.deq.bits.isTP) - - XSPerfHistogram(cacheParams, "prefetch_queue_entry", PopCount(valids.asUInt), - true.B, 0, inflightEntries, 1) -} - -class Prefetcher(implicit p: Parameters) extends PrefetchModule { - val io = IO(new PrefetchIO) - val tpio = IO(new Bundle() { - val tpmeta_port = prefetchOpt match { - case Some(param: PrefetchReceiverParams) => - if (param.hasTPPrefetcher) Some(new tpmetaPortIO()) else None - case _ => None - } - }) - val hartId = IO(Input(UInt(hartIdLen.W))) - - /* io_l2_pf_en: - * chicken bits for whether L2 prefetchers are enabled - * it will control BOP and TP prefetchers - */ - val io_l2_pf_en = IO(Input(Bool())) - - prefetchOpt.get match { - case bop: BOPParameters => - val pft = Module(new BestOffsetPrefetch) - val pftQueue = Module(new PrefetchQueue) - val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) - pft.io.train <> io.train - pft.io.resp <> io.resp - pftQueue.io.enq <> pft.io.req - pipe.io.in <> pftQueue.io.deq - io.req <> pipe.io.out - case receiver: PrefetchReceiverParams => - val pfRcv = Module(new PrefetchReceiver()) - val bop = Module(new BestOffsetPrefetch()(p.alterPartial({ - case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(BOPParameters())) - }))) - val tp = Module(new TemporalPrefetch()(p.alterPartial({ - case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters())) - }))) - val pftQueue = Module(new PrefetchQueue) - val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) - val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B)) - - // prefetch from upper level - pfRcv.io.recv_addr := ValidIODelay(io.recv_addr, 2) - pfRcv.io.train.valid := false.B - pfRcv.io.train.bits := 0.U.asTypeOf(new PrefetchTrain) - pfRcv.io.resp.valid := false.B - pfRcv.io.resp.bits := 0.U.asTypeOf(new PrefetchResp) - assert(!pfRcv.io.req.valid || - pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2SMS.id.U || - pfRcv.io.req.bits.pfSource === 
MemReqSource.Prefetch2L2Stream.id.U || - pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stride.id.U - ) - - // prefetch from local prefetchers: BOP & TP - bop.io.train <> io.train - bop.io.resp <> io.resp - tp.io.train <> io.train - tp.io.resp <> io.resp - tp.io.hartid := hartId - - // send to prq - pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) - pftQueue.io.enq.bits := Mux(pfRcv.io.req.valid, - pfRcv.io.req.bits, - Mux(bop.io.req.valid, - bop.io.req.bits, - tp.io.req.bits - ) - ) - pfRcv.io.req.ready := true.B - bop.io.req.ready := true.B - tp.io.req.ready := !pfRcv.io.req.valid && !bop.io.req.valid - pipe.io.in <> pftQueue.io.deq - io.req <> pipe.io.out - - // tpmeta interface - tp.io.tpmeta_port <> tpio.tpmeta_port.get - - XSPerfAccumulate(cacheParams, "prefetch_req_fromSMS", pfRcv.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && bop.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - pfRcv.io.req.valid && l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) - case _ => assert(cond = false, "Unknown prefetcher") - } -} +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.prefetch + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.tilelink._ +import coupledL2._ +import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram} + +/* virtual address */ +trait HasPrefetcherHelper extends HasCircularQueuePtrHelper with HasCoupledL2Parameters { + // filter + val TRAIN_FILTER_SIZE = 4 + val REQ_FILTER_SIZE = 16 + val TLB_REPLAY_CNT = 10 + + // parameters + val BLK_ADDR_RAW_WIDTH = 10 + val REGION_SIZE = 1024 + val PAGE_OFFSET = pageOffsetBits + val VADDR_HASH_WIDTH = 5 + + // vaddr: + // | tag | index | offset | + // | block addr | block offset | + // | region addr | region offset | + val BLOCK_OFFSET = offsetBits + val REGION_OFFSET = log2Up(REGION_SIZE) + val REGION_BLKS = REGION_SIZE / blockBytes + val INDEX_BITS = log2Up(REGION_BLKS) + val TAG_BITS = fullVAddrBits - REGION_OFFSET + val PTAG_BITS = fullAddressBits - REGION_OFFSET + val BLOCK_ADDR_BITS = fullVAddrBits - BLOCK_OFFSET + + // hash related + val HASH_TAG_WIDTH = VADDR_HASH_WIDTH + BLK_ADDR_RAW_WIDTH + + def get_tag(vaddr: UInt) = { + require(vaddr.getWidth == fullVAddrBits) + vaddr(vaddr.getWidth - 1, REGION_OFFSET) + } + + def get_ptag(vaddr: UInt) = { + require(vaddr.getWidth == fullAddressBits) + vaddr(vaddr.getWidth - 1, REGION_OFFSET) + } + + def get_index(addr: UInt) = { + require(addr.getWidth >= REGION_OFFSET) + addr(REGION_OFFSET - 1, BLOCK_OFFSET) + } + + def get_index_oh(vaddr: UInt): UInt = { + UIntToOH(get_index(vaddr)) + } + + def get_block_vaddr(vaddr: UInt): UInt = { + vaddr(vaddr.getWidth - 1, BLOCK_OFFSET) + } + + def _vaddr_hash(x: UInt): UInt = { + val width = VADDR_HASH_WIDTH + val low = x(width - 1, 0) + val mid = x(2 * width - 1, width) + val high = x(3 * width - 1, 2 * width) + low ^ mid ^ high + } + + def block_hash_tag(vaddr: UInt): UInt = { + val blk_addr = get_block_vaddr(vaddr) + val low = blk_addr(BLK_ADDR_RAW_WIDTH - 1, 0) + val high = blk_addr(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH) + val high_hash = _vaddr_hash(high) + Cat(high_hash, low) + } + + def region_hash_tag(vaddr: UInt): UInt = { + val region_tag = get_tag(vaddr) + val low = region_tag(BLK_ADDR_RAW_WIDTH - 1, 0) + val high = region_tag(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH) + val high_hash = _vaddr_hash(high) + Cat(high_hash, low) + } + + def region_to_block_addr(tag: UInt, index: UInt): UInt = { + Cat(tag, index) + } + + def toBinary(n: Int): String = n match { + case 0 | 1 => s"$n" + case _ => s"${toBinary(n / 2)}${n % 2}" + } +} + +class PrefetchReq(implicit p: Parameters) extends PrefetchBundle { + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val needT = Bool() + val source = UInt(sourceIdBits.W) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + + def isBOP:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U + def isPBOP:Bool = pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def isSMS:Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U + def isTP:Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U + def needAck:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U || pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def fromL2:Bool = + pfSource === MemReqSource.Prefetch2L2BOP.id.U || + pfSource === MemReqSource.Prefetch2L2PBOP.id.U || + pfSource === 
MemReqSource.Prefetch2L2SMS.id.U || + pfSource === MemReqSource.Prefetch2L2TP.id.U +} + +class PrefetchResp(implicit p: Parameters) extends PrefetchBundle { + // val id = UInt(sourceIdBits.W) + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + + def addr = Cat(tag, set, 0.U(offsetBits.W)) + def isBOP: Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U + def isPBOP: Bool = pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def isSMS: Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U + def isTP: Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U + def fromL2: Bool = + pfSource === MemReqSource.Prefetch2L2BOP.id.U || + pfSource === MemReqSource.Prefetch2L2PBOP.id.U || + pfSource === MemReqSource.Prefetch2L2SMS.id.U || + pfSource === MemReqSource.Prefetch2L2TP.id.U +} + +class PrefetchTrain(implicit p: Parameters) extends PrefetchBundle { + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val needT = Bool() + val source = UInt(sourceIdBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val hit = Bool() + val prefetched = Bool() + val pfsource = UInt(PfSource.pfSourceBits.W) + val reqsource = UInt(MemReqSource.reqSourceBits.W) + + def addr: UInt = Cat(tag, set, 0.U(offsetBits.W)) +} + +class PrefetchIO(implicit p: Parameters) extends PrefetchBundle { + val train = Flipped(DecoupledIO(new PrefetchTrain)) + val tlb_req = new L2ToL1TlbIO(nRespDups= 1) + val req = DecoupledIO(new PrefetchReq) + val resp = Flipped(DecoupledIO(new PrefetchResp)) + val recv_addr = Flipped(ValidIO(new Bundle() { + val addr = UInt(64.W) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + })) +} + +class PrefetchQueue(implicit p: Parameters) extends PrefetchModule { + val io = IO(new Bundle { + val enq = Flipped(DecoupledIO(new PrefetchReq)) + val deq = DecoupledIO(new PrefetchReq) + }) + /* Here we implement a queue that + * 1. is pipelined 2. flows + * 3. 
always has the latest reqs, which means the queue is always ready for enq, discarding the oldest ones when full
+   */
+  val queue = RegInit(VecInit(Seq.fill(inflightEntries)(0.U.asTypeOf(new PrefetchReq))))
+  val valids = RegInit(VecInit(Seq.fill(inflightEntries)(false.B)))
+  val idxWidth = log2Up(inflightEntries)
+  val head = RegInit(0.U(idxWidth.W))
+  val tail = RegInit(0.U(idxWidth.W))
+  val empty = head === tail && !valids.last
+  val full = head === tail && valids.last
+
+  when(!empty && io.deq.ready) {
+    valids(head) := false.B
+    head := head + 1.U
+  }
+
+  when(io.enq.valid) {
+    queue(tail) := io.enq.bits
+    valids(tail) := !empty || !io.deq.ready // true.B
+    tail := tail + (!empty || !io.deq.ready).asUInt
+    when(full && !io.deq.ready) {
+      head := head + 1.U
+    }
+  }
+
+  io.enq.ready := true.B
+  io.deq.valid := !empty || io.enq.valid
+  io.deq.bits := Mux(empty, io.enq.bits, queue(head))
+
+  // The number of reqs discarded = enq - deq
+  XSPerfAccumulate(cacheParams, "prefetch_queue_enq", io.enq.fire)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromBOP", io.enq.fire && io.enq.bits.isBOP)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromPBOP", io.enq.fire && io.enq.bits.isPBOP)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromSMS", io.enq.fire && io.enq.bits.isSMS)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromTP", io.enq.fire && io.enq.bits.isTP)
+
+  XSPerfAccumulate(cacheParams, "prefetch_queue_deq", io.deq.fire)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromBOP", io.deq.fire && io.deq.bits.isBOP)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromPBOP", io.deq.fire && io.deq.bits.isPBOP)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromSMS", io.deq.fire && io.deq.bits.isSMS)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromTP", io.deq.fire && io.deq.bits.isTP)
+
+  XSPerfHistogram(cacheParams, "prefetch_queue_entry", PopCount(valids.asUInt),
+    true.B, 0, inflightEntries, 1)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_empty", empty)
+  XSPerfAccumulate(cacheParams, "prefetch_queue_full", full)
+}
+
+class Prefetcher(implicit p: Parameters) extends PrefetchModule {
+  val io = IO(new PrefetchIO)
+  val tpio = IO(new Bundle() {
+    val tpmeta_port = prefetchOpt match {
+      case Some(param: PrefetchReceiverParams) =>
+        if (param.hasTPPrefetcher) Some(new tpmetaPortIO()) else None
+      case _ => None
+    }
+  })
+  val hartId = IO(Input(UInt(hartIdLen.W)))
+
+  /* io_l2_pf_en:
+   * chicken bit for whether the L2 prefetchers are enabled;
+   * it controls the BOP and TP prefetchers
+   */
+  val io_l2_pf_en = IO(Input(Bool()))
+
+  prefetchOpt.get match {
+    case bop: BOPParameters =>
+      val pft = Module(new VBestOffsetPrefetch)
+      val pftQueue = Module(new PrefetchQueue)
+      val pipe = Module(new Pipeline(io.req.bits.cloneType, 1))
+      pft.io.train <> io.train
+      pft.io.resp <> io.resp
+      pft.io.tlb_req <> io.tlb_req
+      pftQueue.io.enq <> pft.io.req
+      pipe.io.in <> pftQueue.io.deq
+      io.req <> pipe.io.out
+    case receiver: PrefetchReceiverParams =>
+      val pfRcv = Module(new PrefetchReceiver())
+      val pbop = Module(new PBestOffsetPrefetch()(p.alterPartial({
+        case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(BOPParameters(
+          virtualTrain = false,
+          badScore = 1,
+          offsetList = Seq(
+            -32, -30, -27, -25, -24, -20, -18, -16, -15,
+            -12, -10, -9, -8, -6, -5, -4, -3, -2, -1,
+            1, 2, 3, 4, 5, 6, 8, 9, 10,
+            12, 15, 16, 18, 20, 24, 25, 27, 30
+          ))))
+      })))
+      val vbop = Module(new VBestOffsetPrefetch()(p.alterPartial({
+        case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(BOPParameters(
+          badScore = 2,
+          offsetList = Seq(
+            -117, -147, -91, 117, 147, 91,
+            -256, -250, -243, -240, -225, -216, -200,
+            -192, -180, -162, -160, -150, -144, -135, -128,
+            -125, -120, -108, -100, -96, -90, -81, -80,
+            -75, -72, -64, -60, -54, -50, -48, -45,
+            -40, -36, -32, -30, -27, -25, -24, -20,
+            -18, -16, -15, -12, -10, -9, -8, -6,
+            -5, -4, -3, -2, -1,
+            1, 2, 3, 4, 5, 6, 8,
+            9, 10, 12, 15, 16, 18, 20, 24,
+            25, 27, 30, 32, 36, 40, 45, 48,
+            50, 54, 60, 64, 72, 75, 80, 81,
+            90, 96, 100, 108, 120, 125, 128, 135,
+            144, 150, 160, 162, 180, 192, 200, 216,
+            225, 240, 243, 250/*, 256*/
+          )
+        )))
+      })))
+      val tp = Module(new TemporalPrefetch()(p.alterPartial({
+        case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters()))
+      })))
+      val pftQueue = Module(new PrefetchQueue)
+      val pipe = Module(new Pipeline(io.req.bits.cloneType, 1))
+      val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B))
+
+      // prefetch from upper level
+      pfRcv.io.recv_addr := ValidIODelay(io.recv_addr, 2)
+      pfRcv.io.train.valid := false.B
+      pfRcv.io.train.bits := 0.U.asTypeOf(new PrefetchTrain)
+      pfRcv.io.resp.valid := false.B
+      pfRcv.io.resp.bits := 0.U.asTypeOf(new PrefetchResp)
+      pfRcv.io.tlb_req.req.ready := true.B
+      pfRcv.io.tlb_req.resp.valid := false.B
+      pfRcv.io.tlb_req.resp.bits := DontCare
+      assert(!pfRcv.io.req.valid ||
+        pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2SMS.id.U ||
+        pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stream.id.U ||
+        pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stride.id.U
+      )
+
+      // prefetch from local prefetchers: BOP & TP
+      vbop.io.train <> io.train
+      vbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U)
+      vbop.io.resp <> io.resp
+      vbop.io.resp.valid := io.resp.valid && io.resp.bits.isBOP
+      vbop.io.tlb_req <> io.tlb_req
+      vbop.io.pbopCrossPage := true.B // pbop.io.pbopCrossPage // let vbop have nothing to do with pbop
+
+      pbop.io.train <> io.train
+      pbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U)
+      pbop.io.resp <> io.resp
+      pbop.io.resp.valid := io.resp.valid && io.resp.bits.isPBOP
+      tp.io.train <> io.train
+      tp.io.resp <> io.resp
+      tp.io.hartid := hartId
+
+      pfRcv.io.req.ready := true.B
+      vbop.io.req.ready := true.B
+      pbop.io.req.ready := true.B
+      tp.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid
+      pipe.io.in <> pftQueue.io.deq
+      io.req <> pipe.io.out
+
+      // tpmeta interface
+      tp.io.tpmeta_port <> tpio.tpmeta_port.get
+
+      /* pri vbop */
+      pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid))
+      pftQueue.io.enq.bits := ParallelPriorityMux(Seq(
+        pfRcv.io.req.valid -> pfRcv.io.req.bits,
+        vbop.io.req.valid -> vbop.io.req.bits,
+        pbop.io.req.valid -> pbop.io.req.bits,
+        tp.io.req.valid -> tp.io.req.bits
+      ))
+      XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid)
+      XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && 
pbop.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* pri pbop */ + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // pbop.io.req.valid -> pbop.io.req.bits, + // vbop.io.req.valid -> vbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* solo vbop */ + // vbop.io.pbopCrossPage := true.B + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // vbop.io.req.valid -> vbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* solo pbop */ + // vbop.io.train.valid := false.B + // vbop.io.resp.valid := false.B + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (pbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // pbop.io.req.valid -> pbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && 
pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + case _ => assert(cond = false, "Unknown prefetcher") + } +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala index b0f3d181c..ff8c5e4fb 100644 --- a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala +++ b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala @@ -344,6 +344,7 @@ class TemporalPrefetch(implicit p: Parameters) extends TPModule { io.req.valid := Mux(enableTP.orR, sending_valid, false.B) io.req.bits.tag := sendingTag io.req.bits.set := sendingSet + io.req.bits.vaddr.foreach(_ := 0.U) io.req.bits.needT := true.B io.req.bits.source := 0.U // TODO: ensure source 0 is dcache io.req.bits.pfSource := MemReqSource.Prefetch2L2TP.id.U diff --git a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala index ab4e4fda8..10dd6fef2 100644 --- a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala +++ b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala @@ -190,6 +190,39 @@ object XSPerfRolling { rollingTable.log(rollingPt, triggerDB, "", clock, reset) } } + + // event interval based mode + def apply( + params: L2Param, + perfName: String, + perfCntX: UInt, + perfCntY: UInt, + granularity: Int, + eventTrigger: UInt, + clock: Clock, + reset: Reset + ): Unit = { + if (params.enablePerf && !params.FPGAPlatform) { + val tableName = perfName + "_rolling_0" + val rollingTable = ChiselDB.createTable(tableName, new RollingEntry(), basicDB = true) + + val xAxisCnt = RegInit(0.U(64.W)) + val yAxisCnt = RegInit(0.U(64.W)) + val eventCnt = RegInit(0.U(64.W)) + xAxisCnt := xAxisCnt + perfCntX + yAxisCnt := yAxisCnt + perfCntY + eventCnt := eventCnt + eventTrigger + + val triggerDB = eventCnt >= granularity.U + when(triggerDB) { + eventCnt := eventTrigger + xAxisCnt := perfCntX + yAxisCnt := perfCntY + } + val rollingPt = new RollingEntry().apply(xAxisCnt, yAxisCnt) + rollingTable.log(rollingPt, triggerDB, "", clock, reset) + } + } } object TransactionLatencyCounter { diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala index 1ae89baf7..696cbc72e 100644 --- a/src/test/scala/TestTop.scala +++ b/src/test/scala/TestTop.scala @@ -394,6 +394,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { case l2 => { l2.module.io.debugTopDown := DontCare l2.module.io.hartId := DontCare + l2.module.io.l2_tlb_req <> DontCare } } diff --git a/utility b/utility index 1b7acf099..92b7cfbba 160000 --- a/utility +++ b/utility @@ -1 +1 @@ -Subproject commit 1b7acf0998ddf175527aa0609788c3fea1262b1f +Subproject commit 92b7cfbbaacfda6c9ff691e12c48421d9f6d0f99 From e9c94b9ab9bad88b586d635d843831c9b14d13b9 Mon Sep 17 00:00:00 2001 From: zhanglinjuan Date: Tue, 7 May 2024 20:42:58 +0800 Subject: [PATCH 8/9] Merge branch 'master' into chi-coupledl2-merge-master --- src/main/scala/coupledL2/Common.scala | 45 ++ src/main/scala/coupledL2/CoupledL2.scala | 3 + src/main/scala/coupledL2/GrantBuffer.scala | 6 + src/main/scala/coupledL2/SinkA.scala | 8 +- src/main/scala/coupledL2/TopDownMonitor.scala | 49 
+- .../prefetch/BestOffsetPrefetch.scala | 571 +++++++++++++++- .../prefetch/PrefetchParameters.scala | 2 + .../coupledL2/prefetch/PrefetchReceiver.scala | 5 + .../scala/coupledL2/prefetch/Prefetcher.scala | 628 ++++++++++++------ .../coupledL2/prefetch/TemporalPrefetch.scala | 1 + .../coupledL2/tl2chi/TL2CHICoupledL2.scala | 163 +++-- src/main/scala/coupledL2/tl2tl/MSHR.scala | 6 +- src/main/scala/coupledL2/tl2tl/Slice.scala | 3 + .../coupledL2/tl2tl/TL2TLCoupledL2.scala | 125 +++- .../coupledL2/utils/L2PerfCounterUtils.scala | 33 + src/test/scala/TestTop.scala | 1 + src/test/scala/chi/TestTop.scala | 1 + utility | 2 +- 18 files changed, 1294 insertions(+), 358 deletions(-) diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala index a9393dac6..60e68bca0 100644 --- a/src/main/scala/coupledL2/Common.scala +++ b/src/main/scala/coupledL2/Common.scala @@ -316,3 +316,48 @@ class L2ToL1Hint(implicit p: Parameters) extends Bundle { val sourceId = UInt(32.W) // tilelink sourceID val isKeyword = Bool() // miss entry keyword } + +// custom l2 - l1 tlb +// FIXME lyq: Tlbcmd and TlbExceptionBundle, how to use L1 corresponding bundles? +object TlbCmd { + def read = "b00".U + def write = "b01".U + def exec = "b10".U + + def atom_read = "b100".U // lr + def atom_write = "b101".U // sc / amo + + def apply() = UInt(3.W) + def isRead(a: UInt) = a(1,0)===read + def isWrite(a: UInt) = a(1,0)===write + def isExec(a: UInt) = a(1,0)===exec + + def isAtom(a: UInt) = a(2) + def isAmo(a: UInt) = a===atom_write // NOTE: sc mixed +} +class TlbExceptionBundle extends Bundle { + val ld = Output(Bool()) + val st = Output(Bool()) + val instr = Output(Bool()) +} +class L2TlbReq(implicit p: Parameters) extends L2Bundle{ + val vaddr = Output(UInt((fullVAddrBits+offsetBits).W)) + val cmd = Output(TlbCmd()) + val size = Output(UInt(log2Ceil(log2Ceil(XLEN/8) + 1).W)) + val kill = Output(Bool()) // Use for blocked tlb that need sync with other module like icache + val no_translate = Output(Bool()) // do not translate, but still do pmp/pma check +} +class L2TlbResp(nDups: Int = 1)(implicit p: Parameters) extends L2Bundle { + val paddr = Vec(nDups, Output(UInt(fullAddressBits.W))) + val miss = Output(Bool()) + val excp = Vec(nDups, new Bundle { + val gpf = new TlbExceptionBundle() + val pf = new TlbExceptionBundle() + val af = new TlbExceptionBundle() + }) +} +class L2ToL1TlbIO(nRespDups: Int = 1)(implicit p: Parameters) extends L2Bundle{ + val req = DecoupledIO(new L2TlbReq) + val req_kill = Output(Bool()) + val resp = Flipped(DecoupledIO(new L2TlbResp(nRespDups))) +} diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala index 38b099093..25a7ca062 100644 --- a/src/main/scala/coupledL2/CoupledL2.scala +++ b/src/main/scala/coupledL2/CoupledL2.scala @@ -38,6 +38,7 @@ trait HasCoupledL2Parameters { val enableCHI = p(EnableCHI) val cacheParams = p(L2ParamKey) + val XLEN = 64 val blocks = cacheParams.sets * cacheParams.ways val blockBytes = cacheParams.blockBytes val beatBytes = cacheParams.channelBytes.d.get @@ -51,8 +52,10 @@ trait HasCoupledL2Parameters { val chiOpt = if (enableCHI) Some(true) else None val aliasBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.aliasBitsOpt + // vaddr without offset bits val vaddrBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.vaddrBitsOpt + val fullVAddrBits = vaddrBitsOpt.getOrElse(0) + offsetBits // from L1 load miss cache require val 
isKeywordBitsOpt = if(cacheParams.clientCaches.isEmpty) None else cacheParams.clientCaches.head.isKeywordBitsOpt
diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala
index ccf0d5c5f..8d14e79da 100644
--- a/src/main/scala/coupledL2/GrantBuffer.scala
+++ b/src/main/scala/coupledL2/GrantBuffer.scala
@@ -216,6 +216,8 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
     val pftRespEntry = new Bundle() {
       val tag = UInt(tagBits.W)
       val set = UInt(setBits.W)
+      val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W))
+      val pfSource = UInt(MemReqSource.reqSourceBits.W)
     }
     // TODO: this may not need 10 entries, but this does not take much space
     val pftQueueLen = 10
@@ -225,11 +227,15 @@
       io.d_task.bits.task.fromL2pft.getOrElse(false.B)
     pftRespQueue.get.io.enq.bits.tag := io.d_task.bits.task.tag
     pftRespQueue.get.io.enq.bits.set := io.d_task.bits.task.set
+    pftRespQueue.get.io.enq.bits.vaddr.foreach(_ := io.d_task.bits.task.vaddr.getOrElse(0.U))
+    pftRespQueue.get.io.enq.bits.pfSource := io.d_task.bits.task.reqSource
     val resp = io.prefetchResp.get
     resp.valid := pftRespQueue.get.io.deq.valid
     resp.bits.tag := pftRespQueue.get.io.deq.bits.tag
     resp.bits.set := pftRespQueue.get.io.deq.bits.set
+    resp.bits.vaddr.foreach(_ := pftRespQueue.get.io.deq.bits.vaddr.getOrElse(0.U))
+    resp.bits.pfSource := pftRespQueue.get.io.deq.bits.pfSource
     pftRespQueue.get.io.deq.ready := resp.ready
     assert(pftRespQueue.get.io.enq.ready, "pftRespQueue should never be full, no back pressure logic")
diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala
index bd6ab58c2..457640608 100644
--- a/src/main/scala/coupledL2/SinkA.scala
+++ b/src/main/scala/coupledL2/SinkA.scala
@@ -93,8 +93,8 @@ class SinkA(implicit p: Parameters) extends L2Module {
     task.mshrId := 0.U(mshrBits.W)
     task.aliasTask.foreach(_ := false.B)
     task.useProbeData := false.B
+    task.fromL2pft.foreach(_ := req.needAck)
     task.mshrRetry := false.B
-    task.fromL2pft.foreach(_ := req.isBOP)
     task.needHint.foreach(_ := false.B)
     task.dirty := false.B
     task.way := 0.U(wayBits.W)
@@ -105,7 +105,7 @@ class SinkA(implicit p: Parameters) extends L2Module {
     task.wayMask := 0.U(cacheParams.ways.W)
     task.reqSource := req.pfSource
     task.replTask := false.B
-    task.vaddr.foreach(_ := 0.U)
+    task.vaddr.foreach(_ := req.vaddr.getOrElse(0.U))
     task.isKeyword.foreach(_ := false.B)
     task.mergeA := false.B
     task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle)
@@ -137,8 +137,8 @@ class SinkA(implicit p: Parameters) extends L2Module {
   prefetchOpt.foreach { _ =>
     XSPerfAccumulate(cacheParams, "sinkA_prefetch_req", io.prefetchReq.get.fire)
-    XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire)
-    XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.isBOP && io.prefetchReq.get.fire)
+    XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l2", io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire)
+    XSPerfAccumulate(cacheParams, "sinkA_prefetch_from_l1", !io.prefetchReq.get.bits.fromL2 && io.prefetchReq.get.fire)
   }
   // cycles stalled by mainpipe
diff --git a/src/main/scala/coupledL2/TopDownMonitor.scala b/src/main/scala/coupledL2/TopDownMonitor.scala
index 678f58ba8..d4dc8d37a 100644
--- a/src/main/scala/coupledL2/TopDownMonitor.scala
+++ b/src/main/scala/coupledL2/TopDownMonitor.scala
@@ -32,6 +32,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
    val 
msStatus = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRStatus)))) val latePF = Vec(banks, Input(Bool())) val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W)))) val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool())) } @@ -114,6 +115,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { val l2prefetchSent = dirResultMatchVec( r => !r.hit && (r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U || + r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stride.id.U || r.replacerInfo.reqSource === MemReqSource.Prefetch2L2Stream.id.U || @@ -122,6 +124,9 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { val l2prefetchSentBOP = dirResultMatchVec( r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2BOP.id.U ) + val l2prefetchSentPBOP = dirResultMatchVec( + r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2PBOP.id.U + ) val l2prefetchSentSMS = dirResultMatchVec( r => !r.hit && r.replacerInfo.reqSource === MemReqSource.Prefetch2L2SMS.id.U ) @@ -142,6 +147,10 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { r => reqFromCPU(r) && r.hit && r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.BOP.id.U ) + val l2prefetchUsefulPBOP = dirResultMatchVec( + r => reqFromCPU(r) && r.hit && + r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.PBOP.id.U + ) val l2prefetchUsefulSMS = dirResultMatchVec( r => reqFromCPU(r) && r.hit && r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === PfSource.SMS.id.U @@ -168,81 +177,91 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { XSPerfRolling( cacheParams, "L2PrefetchAccuracy", PopCount(l2prefetchUseful), PopCount(l2prefetchSent), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyBOP", PopCount(l2prefetchUsefulBOP), PopCount(l2prefetchSentBOP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset + ) + XSPerfRolling( + cacheParams, "L2PrefetchAccuracyPBOP", + PopCount(l2prefetchUsefulPBOP), PopCount(l2prefetchSentPBOP), + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracySMS", PopCount(l2prefetchUsefulSMS), PopCount(l2prefetchSentSMS), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyTP", PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyStride", PopCount(l2prefetchUsefulStride), PopCount(l2prefetchSentStride), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyStream", PopCount(l2prefetchUsefulStream), PopCount(l2prefetchSentStream), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchAccuracyTP", PopCount(l2prefetchUsefulTP), PopCount(l2prefetchSentTP), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) // PF Late XSPerfRolling( cacheParams, "L2PrefetchLate", PopCount(l2prefetchLate), 
PopCount(l2prefetchUseful), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) // PF Coverage XSPerfRolling( cacheParams, "L2PrefetchCoverage", PopCount(l2prefetchUseful), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageBOP", PopCount(l2prefetchUsefulBOP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset + ) + XSPerfRolling( + cacheParams, "L2PrefetchCoveragePBOP", + PopCount(l2prefetchUsefulPBOP), PopCount(l2demandRequest), + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageSMS", PopCount(l2prefetchUsefulSMS), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageTP", PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageStride", PopCount(l2prefetchUsefulStride), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageStream", PopCount(l2prefetchUsefulStream), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfRolling( cacheParams, "L2PrefetchCoverageTP", PopCount(l2prefetchUsefulTP), PopCount(l2demandRequest), - 1000, clock, reset + 1000, io.debugTopDown.robTrueCommit, clock, reset ) XSPerfAccumulate(cacheParams, "l2prefetchSent", PopCount(l2prefetchSent)) diff --git a/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala b/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala index a8617d1bc..742bcdcd4 100644 --- a/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala +++ b/src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala @@ -17,29 +17,41 @@ package coupledL2.prefetch -import utility.{MemReqSource, SRAMTemplate} +import utility.{ChiselDB, Constantin, MemReqSource, ParallelPriorityMux, RRArbiterInit, SRAMTemplate} import org.chipsalliance.cde.config.Parameters +import chisel3.DontCare.:= import chisel3._ import chisel3.util._ -import coupledL2.HasCoupledL2Parameters -import coupledL2.utils.XSPerfAccumulate +import coupledL2.{HasCoupledL2Parameters, L2TlbReq, L2ToL1TlbIO, TlbCmd} +import coupledL2.utils.{ReplacementPolicy, XSPerfAccumulate} +import scopt.Read case class BOPParameters( + virtualTrain: Boolean = true, rrTableEntries: Int = 256, rrTagBits: Int = 12, scoreBits: Int = 5, roundMax: Int = 50, - badScore: Int = 1, + badScore: Int = 2, + tlbReplayCnt: Int = 10, + dQEntries: Int = 16, + dQLatency: Int = 175, + dQMaxLatency: Int = 256, offsetList: Seq[Int] = Seq( - -32, -30, -27, -25, -24, -20, -18, -16, -15, - -12, -10, -9, -8, -6, -5, -4, -3, -2, -1, - 1, 2, 3, 4, 5, 6, 8, 9, 10, - 12, 15, 16, 18, 20, 24, 25, 27, 30//, - /*32, 36, - 40, 45, 48, 50, 54, 60, 64, 72, 75, 80, - 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, - 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, - 250, 256*/ + -256, -250, -243, -240, -225, -216, -200, + -192, -180, -162, -160, -150, -144, -135, -128, + -125, -120, -108, -100, -96, -90, -81, -80, + -75, -72, -64, -60, -54, -50, -48, -45, + -40, -36, -32, -30, -27, -25, -24, -20, + -18, -16, -15, -12, -10, -9, -8, -6, + -5, -4, -3, -2, -1, + 1, 2, 3, 4, 5, 6, 8, + 9, 10, 12, 15, 16, 18, 20, 24, + 25, 27, 30, 32, 36, 40, 45, 48, + 50, 54, 60, 64, 
72, 75, 80, 81, + 90, 96, 100, 108, 120, 125, 128, 135, + 144, 150, 160, 162, 180, 192, 200, 216, + 225, 240, 243, 250/*, 256*/ )) extends PrefetchParameters { override val hasPrefetchBit: Boolean = true @@ -47,23 +59,34 @@ case class BOPParameters( override val inflightEntries: Int = 16 } -trait HasBOPParams extends HasCoupledL2Parameters { +trait HasBOPParams extends HasPrefetcherHelper { val bopParams = prefetchOpt.get.asInstanceOf[BOPParameters] + + // train address space: virtual or physical + val virtualTrain = bopParams.virtualTrain + val fullAddrBits = if(virtualTrain) fullVAddrBits else fullAddressBits + val noOffsetAddrBits = fullAddrBits - offsetBits + override val REQ_FILTER_SIZE = 16 + // Best offset val defaultMinAddrBits = offsetBits + log2Up(bopParams.rrTableEntries) + bopParams.rrTagBits - val defaultConfig = fullAddressBits >= defaultMinAddrBits + val defaultConfig = fullAddrBits >= defaultMinAddrBits val rrTableEntries = if (defaultConfig) bopParams.rrTableEntries else 2 val rrIdxBits = log2Up(rrTableEntries) - val rrTagBits = if (defaultConfig) bopParams.rrTagBits else (fullAddressBits - offsetBits - rrIdxBits) + val rrTagBits = if (defaultConfig) bopParams.rrTagBits else (fullAddrBits - offsetBits - rrIdxBits) val scoreBits = bopParams.scoreBits val roundMax = bopParams.roundMax val badScore = bopParams.badScore + val initScore = bopParams.badScore + 1 val offsetList = bopParams.offsetList val inflightEntries = bopParams.inflightEntries + val dQEntries = bopParams.dQEntries + val dQLatency = bopParams.dQLatency + val dQMaxLatency = bopParams.dQMaxLatency val scores = offsetList.length - val offsetWidth = log2Up(-offsetList(0)) + 1 // -32 <= offset <= 31 + val offsetWidth = log2Up(offsetList.max) + 2 // -32 <= offset <= 31 val roundBits = log2Up(roundMax) val scoreMax = (1 << scoreBits) - 1 val scoreTableIdxBits = log2Up(scores) @@ -95,7 +118,7 @@ class ScoreTableEntry(implicit p: Parameters) extends BOPBundle { class TestOffsetReq(implicit p: Parameters) extends BOPBundle { // find whether (X-d) is in recent request table - val addr = UInt(fullAddressBits.W) + val addr = UInt(fullAddrBits.W) val testOffset = UInt(offsetWidth.W) val ptr = UInt(scoreTableIdxBits.W) } @@ -113,7 +136,7 @@ class TestOffsetBundle(implicit p: Parameters) extends BOPBundle { class RecentRequestTable(implicit p: Parameters) extends BOPModule { val io = IO(new Bundle { - val w = Flipped(DecoupledIO(UInt(fullAddressBits.W))) + val w = Flipped(DecoupledIO(UInt(fullAddrBits.W))) val r = Flipped(new TestOffsetBundle) }) @@ -124,7 +147,7 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule { // +-------+------------------+---------------+----------------------+ // or: | ... 
| 12-bit tag | 8-bit hash1 | 6-bit cache offset | // +-------+------------------+---------------+----------------------+ - def lineAddr(addr: UInt) = addr(fullAddressBits - 1, offsetBits) + def lineAddr(addr: UInt) = addr(fullAddrBits - 1, offsetBits) def hash1(addr: UInt) = lineAddr(addr)(rrIdxBits - 1, 0) def hash2(addr: UInt) = lineAddr(addr)(2 * rrIdxBits - 1, rrIdxBits) def idx(addr: UInt) = hash1(addr) ^ hash2(addr) @@ -144,7 +167,7 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule { rrTable.io.w.req.bits.data(0).valid := true.B rrTable.io.w.req.bits.data(0).tag := tag(wAddr) - val rAddr = io.r.req.bits.addr - signedExtend((io.r.req.bits.testOffset << offsetBits), fullAddressBits) + val rAddr = io.r.req.bits.addr - signedExtend((io.r.req.bits.testOffset << offsetBits), fullAddrBits) val rData = Wire(rrTableEntry()) rrTable.io.r.req.valid := io.r.req.fire rrTable.io.r.req.bits.setIdx := idx(rAddr) @@ -160,23 +183,27 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule { } -class OffsetScoreTable(implicit p: Parameters) extends BOPModule { +class OffsetScoreTable(name: String = "")(implicit p: Parameters) extends BOPModule { val io = IO(new Bundle { - val req = Flipped(DecoupledIO(UInt(fullAddressBits.W))) + val req = Flipped(DecoupledIO(UInt(fullAddrBits.W))) val prefetchOffset = Output(UInt(offsetWidth.W)) + val prefetchDisable = Output(Bool()) val test = new TestOffsetBundle }) val prefetchOffset = RegInit(2.U(offsetWidth.W)) + val prefetchDisable = RegInit(false.B) // score table // val st = RegInit(VecInit(offsetList.map(off => (new ScoreTableEntry).apply(off.U, 0.U)))) val st = RegInit(VecInit(Seq.fill(scores)((new ScoreTableEntry).apply(0.U)))) val offList = WireInit(VecInit(offsetList.map(off => off.S(offsetWidth.W).asUInt))) val ptr = RegInit(0.U(scoreTableIdxBits.W)) val round = RegInit(0.U(roundBits.W)) - + + val badscoreConstant = WireInit(Constantin.createRecord(name+"BadScore", bopParams.badScore.U)) + val initscoreConstant = WireInit(Constantin.createRecord(name+"InitScore", (bopParams.badScore+1).U)) val bestOffset = RegInit(2.U(offsetWidth.W)) // the entry with the highest score while traversing - val bestScore = RegInit(badScore.U(scoreBits.W)) + val bestScore = RegInit(10.U) val testOffset = offList(ptr) // def winner(e1: ScoreTableEntry, e2: ScoreTableEntry): ScoreTableEntry = { // val w = Wire(new ScoreTableEntry) @@ -190,12 +217,14 @@ class OffsetScoreTable(implicit p: Parameters) extends BOPModule { // 1. At the start of a learning phase // All the scores are reset to 0. // At the end of every learning phase, the prefetch offset is updated as the one with the highest score. 
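+  // A compact reference model of the phase-end rule described above (an illustrative sketch,
+  // not part of the hardware; names mirror the Chisel signals, and the constants assume the
+  // defaults scoreBits = 5, i.e. scoreMax = 31, and badScore = 2):
+  //   def phaseResult(scores: Seq[Int], offsets: Seq[Int]): (Int, Boolean) = {
+  //     val (bestScore, bestIdx) = scores.zipWithIndex.maxBy(_._1)
+  //     (offsets(bestIdx), bestScore < 2) // (new prefetchOffset, prefetchDisable)
+  //   }
+  // During a phase, every trained address X probes the RR table at X - testOffset and a hit
+  // increments that offset's score; the phase ends after roundMax rounds, or earlier once a
+  // score saturates at scoreMax, so a low bestScore only pauses prefetching until the next phase.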
+  val isBad = bestScore < badscoreConstant
   when(state === s_idle) {
     st.foreach(_.score := 0.U)
     ptr := 0.U
     round := 0.U
-    bestScore := badScore.U
+    bestScore := 0.U
     prefetchOffset := bestOffset
+    prefetchDisable := isBad
     state := s_learn
   }
@@ -237,12 +266,17 @@
   io.req.ready := state === s_learn
   io.prefetchOffset := prefetchOffset
+  io.prefetchDisable := prefetchDisable
   io.test.req.valid := state === s_learn && io.req.valid
   io.test.req.bits.addr := io.req.bits
   io.test.req.bits.testOffset := testOffset
   io.test.req.bits.ptr := ptr
   io.test.resp.ready := true.B
+
+  XSPerfAccumulate(cacheParams, "total_learn_phase", state === s_idle)
+  XSPerfAccumulate(cacheParams, "total_bop_disable", state === s_idle && isBad)
+  XSPerfAccumulate(cacheParams, "total_bop_high_confidence", state === s_idle && bestScore === scoreMax.U)
+
   for (off <- offsetList) {
     if (off < 0) {
       XSPerfAccumulate(cacheParams, "best_offset_neg_" + (-off).toString + "_learning_phases",
@@ -253,25 +287,489 @@
     }
   }
+  // FIXME lyq: remove the db
+  class BopTrainEntry extends Bundle {
+    val bestOffset = UInt(offsetWidth.W)
+    val bestScore = UInt(scoreBits.W)
+  }
+
+  val l2BopTrainTable = ChiselDB.createTable("L2BopTrainTable", new BopTrainEntry, basicDB = true)
+  for (i <- 0 until REQ_FILTER_SIZE) {
+    val data = Wire(new BopTrainEntry)
+    data.bestOffset := bestOffset
+    data.bestScore := bestScore
+    l2BopTrainTable.log(data = data, en = (state === s_idle) && !isBad, site = name+"OffsetScoreTable", clock, reset)
+  }
+
+}
+
+class BopReqBundle(implicit p: Parameters) extends BOPBundle {
+  val full_vaddr = UInt(fullVAddrBits.W)
+  val base_vaddr = UInt(vaddrBitsOpt.getOrElse(0).W)
+  val needT = Bool()
+  val source = UInt(sourceIdBits.W)
+  val isBOP = Bool()
+}
+
+class BopReqBufferEntry(implicit p: Parameters) extends BOPBundle {
+  val valid = Bool()
+  // for tlb req
+  val paddrValid = Bool()
+  val vaddrNoOffset = UInt((fullVAddrBits-offsetBits).W)
+  val baseVaddr = UInt((fullVAddrBits-offsetBits).W)
+  val paddrNoOffset = UInt(fullVAddrBits.W)
+  val replayEn = Bool()
+  val replayCnt = UInt(4.W)
+  // for pf req
+  val needT = Bool()
+  val source = UInt(sourceIdBits.W)
+
+  def reset(x: UInt): Unit = {
+    valid := false.B
+    paddrValid := false.B
+    vaddrNoOffset := 0.U
+    baseVaddr := 0.U
+    paddrNoOffset := 0.U
+    replayEn := false.B
+    replayCnt := 0.U
+    needT := false.B
+    source := 0.U
+  }
+
+  def fromBopReqBundle(req: BopReqBundle) = {
+    valid := true.B
+    paddrValid := false.B
+    vaddrNoOffset := get_block_vaddr(req.full_vaddr)
+    baseVaddr := req.base_vaddr
+    replayEn := false.B
+    replayCnt := 0.U
+    paddrNoOffset := 0.U
+    needT := req.needT
+    source := req.source
+  }
+
+  def isEqualBopReq(req: BopReqBundle) = {
+    // FIXME lyq: the comparison logic is very complicated; is there a way to simplify it?
+    valid &&
+    vaddrNoOffset === get_block_vaddr(req.full_vaddr) &&
+    baseVaddr === req.base_vaddr &&
+    needT === req.needT &&
+    source === req.source
+  }
+
+  def toPrefetchReq(): PrefetchReq = {
+    val req = Wire(new PrefetchReq)
+    req.tag := parseFullAddress(get_pf_paddr())._1
+    req.set := parseFullAddress(get_pf_paddr())._2
+    req.vaddr.foreach(_ := baseVaddr)
+    req.needT := needT
+    req.source := source
+    req.pfSource := MemReqSource.Prefetch2L2BOP.id.U
+    req
+  }
+
+  def can_send_pf(): Bool = {
+    valid && paddrValid
+  }
+
+  def get_pf_paddr(): UInt = {
+    Cat(paddrNoOffset, 0.U(offsetBits.W))
+  }
+
+  def get_tlb_vaddr(): UInt = {
+    Cat(vaddrNoOffset, 0.U(offsetBits.W))
+  }
+
+  def update_paddr(paddr: UInt) = {
+    paddrValid := true.B
+    paddrNoOffset := paddr(paddr.getWidth-1, offsetBits)
+    replayEn := false.B
+    replayCnt := 0.U
+  }
+
+  def update_sent(): Unit = {
+    valid := false.B
+  }
+
+  def update_excp(): Unit = {
+    valid := false.B
+  }
+}
+
+class PrefetchReqBuffer(implicit p: Parameters) extends BOPModule {
+  val io = IO(new Bundle() {
+    val in_req = Flipped(ValidIO(new BopReqBundle))
+    val tlb_req = new L2ToL1TlbIO(nRespDups = 1)
+    val out_req = DecoupledIO(new PrefetchReq)
+  })
+
+  val firstTlbReplayCnt = WireInit(Constantin.createRecord("firstTlbReplayCnt", bopParams.tlbReplayCnt.U))
+
+  def wayMap[T <: Data](f: Int => T) = VecInit((0 until REQ_FILTER_SIZE).map(f))
+  def get_flag(vaddr: UInt) = get_block_vaddr(vaddr)
+
+  // if full then drop new req, so there is no need to use s1_evicted_oh & replacement
+  val entries = Seq.fill(REQ_FILTER_SIZE)(Reg(new BopReqBufferEntry))
+  //val replacement = ReplacementPolicy.fromString("plru", REQ_FILTER_SIZE)
+  val tlb_req_arb = Module(new RRArbiterInit(new L2TlbReq, REQ_FILTER_SIZE))
+  val pf_req_arb = Module(new RRArbiterInit(new PrefetchReq, REQ_FILTER_SIZE))
+
+  io.tlb_req.req <> tlb_req_arb.io.out
+  io.tlb_req.req_kill := false.B
+  io.tlb_req.resp.ready := true.B
+  io.out_req <> pf_req_arb.io.out
+
+  /* s0: entries look up */
+  val prev_in_valid = RegNext(io.in_req.valid, false.B)
+  val prev_in_req = RegEnable(io.in_req.bits, io.in_req.valid)
+  val prev_in_flag = get_flag(prev_in_req.full_vaddr)
+  // s1 entry update
+  val alloc = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+
+  val s0_in_req = io.in_req.bits
+  val s0_in_flag = get_flag(s0_in_req.full_vaddr)
+  val s0_conflict_prev = prev_in_valid && s0_in_flag === prev_in_flag
+  // FIXME lyq: the comparison logic is very complicated; is there a way to simplify it?
+  val s0_match_oh = VecInit(entries.indices.map(i =>
+    entries(i).valid && entries(i).vaddrNoOffset === s0_in_flag &&
+    entries(i).needT === s0_in_req.needT && entries(i).source === s0_in_req.source &&
+    entries(i).baseVaddr === s0_in_req.base_vaddr
+  )).asUInt
+  val s0_match = Cat(s0_match_oh).orR
+
+  val s0_invalid_vec = wayMap(w => !entries(w).valid && !alloc(w))
+  val s0_has_invalid_way = s0_invalid_vec.asUInt.orR
+  val s0_invalid_oh = ParallelPriorityMux(s0_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(REQ_FILTER_SIZE.W))))
+
+  val s0_req_valid = io.in_req.valid && !s0_conflict_prev && !s0_match && s0_has_invalid_way
+  val s0_tlb_fire_oh = VecInit(tlb_req_arb.io.in.map(_.fire)).asUInt
+  val s0_pf_fire_oh = VecInit(pf_req_arb.io.in.map(_.fire)).asUInt
+  //val s0_access_way = Mux(s0_match, OHToUInt(s0_match_oh), OHToUInt(s0_replace_oh))
+  //when(s0_req_valid){
+  //  replacement.access(s0_access_way)
+  //}
+  XSPerfAccumulate(cacheParams, "recv_req", io.in_req.valid)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_conflict", io.in_req.valid && s0_conflict_prev)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_match", io.in_req.valid && !s0_conflict_prev && s0_match)
+  XSPerfAccumulate(cacheParams, "recv_req_drop_full", io.in_req.valid && !s0_conflict_prev && !s0_match && !s0_has_invalid_way)
+
+  /* s1 update and replace */
+  val s1_valid = RegNext(s0_req_valid, false.B)
+  val s1_in_req = RegEnable(s0_in_req, s0_req_valid)
+  val s1_invalid_oh = RegEnable(s0_invalid_oh, 0.U, s0_req_valid)
+  val s1_pf_fire_oh = RegNext(s0_pf_fire_oh, 0.U)
+  val s1_tlb_fire_oh = RegNext(s0_tlb_fire_oh, 0.U)
+  val s1_alloc_entry = Wire(new BopReqBufferEntry)
+  s1_alloc_entry.fromBopReqBundle(s1_in_req)
+
+  /* entry update */
+  val exp_drop = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val miss_drop = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val miss_first_replay = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val pf_fired = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  val tlb_fired = Wire(Vec(REQ_FILTER_SIZE, Bool()))
+  for ((e, i) <- entries.zipWithIndex){
+    alloc(i) := s1_valid && s1_invalid_oh(i)
+    pf_fired(i) := s0_pf_fire_oh(i)
+    exp_drop(i) := s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss &&
+      ((e.needT && (io.tlb_req.resp.bits.excp.head.pf.st || io.tlb_req.resp.bits.excp.head.af.st)) ||
+      (!e.needT && (io.tlb_req.resp.bits.excp.head.pf.ld || io.tlb_req.resp.bits.excp.head.af.ld)))
+    val miss = s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && io.tlb_req.resp.bits.miss
+    tlb_fired(i) := s1_tlb_fire_oh(i) && io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss && !exp_drop(i)
+    miss_drop(i) := miss && e.replayEn
+    miss_first_replay(i) := miss && !e.replayEn
+
+    // old data: update replayCnt
+    when(e.valid && e.replayCnt.orR) {
+      e.replayCnt := e.replayCnt - 1.U
+    }
+    // recent data: update tlb resp
+    when(tlb_fired(i)){
+      e.update_paddr(io.tlb_req.resp.bits.paddr.head)
+    }.elsewhen(miss_drop(i)) { // miss
+      e.reset(i.U)
+    }.elsewhen(miss_first_replay(i)){
+      e.replayCnt := firstTlbReplayCnt
+      e.replayEn := true.B
+    }.elsewhen(exp_drop(i)){
+      e.update_excp()
+    }
+    // issue data: update pf
+    when(pf_fired(i)){
+      e.update_sent()
+    }
+    // new data: update data
+    when(alloc(i)){
+      e := s1_alloc_entry
+    }
+  }
+
+  /* tlb & pf */
+  for((e, i) <- entries.zipWithIndex){
+    //tlb_req_arb.io.in(i).valid := e.valid && !s1_tlb_fire_oh(i) && !s2_tlb_fire_oh(i) && !e.paddrValid && !s1_evicted_oh(i)
+    tlb_req_arb.io.in(i).valid := e.valid && !e.paddrValid && !s1_tlb_fire_oh(i) && !e.replayCnt.orR
+    tlb_req_arb.io.in(i).bits.vaddr := e.get_tlb_vaddr()
+    when(e.needT) {
+      tlb_req_arb.io.in(i).bits.cmd := TlbCmd.write
+    }.otherwise{
+      tlb_req_arb.io.in(i).bits.cmd := TlbCmd.read
+    }
+    tlb_req_arb.io.in(i).bits.size := 3.U
+    tlb_req_arb.io.in(i).bits.kill := false.B
+    tlb_req_arb.io.in(i).bits.no_translate := false.B
+
+    pf_req_arb.io.in(i).valid := e.can_send_pf()
+    pf_req_arb.io.in(i).bits := e.toPrefetchReq()
+  }
+
+  // reset meta to avoid the multi-hit problem
+  for (i <- 0 until REQ_FILTER_SIZE) {
+    when(reset.asBool) {
+      entries(i).reset(i.U)
+    }
+  }
+
+  XSPerfAccumulate(cacheParams, "tlb_req", io.tlb_req.req.valid)
+  XSPerfAccumulate(cacheParams, "tlb_miss", io.tlb_req.resp.valid && io.tlb_req.resp.bits.miss)
+  XSPerfAccumulate(cacheParams, "tlb_excp",
+    io.tlb_req.resp.valid && !io.tlb_req.resp.bits.miss && (
+      io.tlb_req.resp.bits.excp.head.pf.st || io.tlb_req.resp.bits.excp.head.af.st ||
+      io.tlb_req.resp.bits.excp.head.pf.ld || io.tlb_req.resp.bits.excp.head.af.ld
+    ))
+  XSPerfAccumulate(cacheParams, "entry_alloc", PopCount(alloc))
+  XSPerfAccumulate(cacheParams, "entry_miss_first_replay", PopCount(miss_first_replay))
+  XSPerfAccumulate(cacheParams, "entry_miss_drop", PopCount(miss_drop))
+  XSPerfAccumulate(cacheParams, "entry_excp", PopCount(exp_drop))
+  XSPerfAccumulate(cacheParams, "entry_merge", io.in_req.valid && s0_match)
+  XSPerfAccumulate(cacheParams, "entry_pf_fire", PopCount(pf_fired))
+
+  /*
+  val enTalbe = WireInit(Constantin.createRecord("isWriteL2BopTable", 1.U))
+  val l2BOPTable = ChiselDB.
createTable("L2BOPTable", new BopReqBufferEntry, basicDB = true) + for (i <- 0 until REQ_FILTER_SIZE){ + when(alloc(i)){ + l2BOPTable.log( + data = entries(i), + en = enTalbe.orR && pf_fired(i), + site = "L2BOPTable", + clock = clock, + reset = reset + ) + } + } + */ +} + +class DelayQueue(name: String = "")(implicit p: Parameters) extends BOPModule{ + val io = IO(new Bundle(){ + val in = Flipped(DecoupledIO(UInt(noOffsetAddrBits.W))) + val out = DecoupledIO(UInt(fullAddrBits.W)) + // only record `fullAddrBits - offsetBits` bits + // out.bits = Cat(record, 0.U(offsetBits)) + }) + + /* Setting */ + val IdxWidth = log2Up(dQEntries) + val LatencyWidth = log2Up(dQMaxLatency) + class Entry extends Bundle{ + val addrNoOffset = UInt(noOffsetAddrBits.W) + val cnt = UInt(LatencyWidth.W) + } + val queue = RegInit(VecInit(Seq.fill(dQEntries)(0.U.asTypeOf(new Entry)))) + val valids = RegInit(VecInit(Seq.fill(dQEntries)(false.B))) + val head = RegInit(0.U(IdxWidth.W)) + val tail = RegInit(0.U(IdxWidth.W)) + val empty = head === tail && !valids.last + val full = head === tail && valids.last + val outValid = !empty && !queue(head).cnt.orR && valids(head) + + /* In & Out */ + var setDqLatency = WireInit(Constantin.createRecord("DelayQueueLatency"+name, dQLatency.U)) + when(io.in.valid && !full) { + // if queue is full, we drop the new request + queue(tail).addrNoOffset := io.in.bits + queue(tail).cnt := setDqLatency // dQLatency.U + valids(tail) := true.B + tail := tail + 1.U + + /* + // if full, drop the old request + when(full && !io.deq.ready) { + head := head + 1.U + } + */ + } + when(outValid && io.out.ready) { + valids(head) := false.B + head := head + 1.U + } + io.in.ready := true.B + io.out.valid := outValid + io.out.bits := Cat(queue(head).addrNoOffset, 0.U(offsetBits.W)) + + /* Update */ + for(i <- 0 until dQEntries){ + when(queue(i).cnt.orR){ + queue(i).cnt := queue(i).cnt - 1.U + } + } + + /* Perf */ + XSPerfAccumulate(cacheParams, "full", full) + XSPerfAccumulate(cacheParams, "empty", empty) + XSPerfAccumulate(cacheParams, "entryNumber", PopCount(valids.asUInt)) + XSPerfAccumulate(cacheParams, "inNumber", io.in.valid) + XSPerfAccumulate(cacheParams, "outNumber", io.out.valid) + +} + +class VBestOffsetPrefetch(implicit p: Parameters) extends BOPModule { + val io = IO(new Bundle() { + val train = Flipped(DecoupledIO(new PrefetchTrain)) + val pbopCrossPage = Input(Bool()) + val tlb_req = new L2ToL1TlbIO(nRespDups= 1) + val req = DecoupledIO(new PrefetchReq) + val resp = Flipped(DecoupledIO(new PrefetchResp)) + }) + + val delayQueue = Module(new DelayQueue("vbop")) + val rrTable = Module(new RecentRequestTable) + val scoreTable = Module(new OffsetScoreTable("vbop")) + + val s0_fire = scoreTable.io.req.fire && io.pbopCrossPage + val s1_fire = WireInit(false.B) + val s0_ready, s1_ready = WireInit(false.B) + + /* s0 train */ + val prefetchOffset = scoreTable.io.prefetchOffset + val prefetchDisable = scoreTable.io.prefetchDisable + // NOTE: vaddr from l1 to l2 has no offset bits + val s0_reqVaddr = io.train.bits.vaddr.getOrElse(0.U) + val s0_oldFullAddr = if(virtualTrain) Cat(io.train.bits.vaddr.getOrElse(0.U), 0.U(offsetBits.W)) else io.train.bits.addr + val s0_oldFullAddrNoOff = s0_oldFullAddr(s0_oldFullAddr.getWidth-1, offsetBits) + val s0_newFullAddr = s0_oldFullAddr + signedExtend((prefetchOffset << offsetBits), fullAddrBits) + val s0_crossPage = getPPN(s0_newFullAddr) =/= getPPN(s0_oldFullAddr) // unequal tags + val respFullAddr = if(virtualTrain) Cat(io.resp.bits.vaddr.getOrElse(0.U), 
0.U(offsetBits.W))
+                        else io.resp.bits.addr - signedExtend((prefetchOffset << offsetBits), fullAddrBits)
+
+  rrTable.io.r <> scoreTable.io.test
+  rrTable.io.w <> delayQueue.io.out
+  delayQueue.io.in.valid := io.train.valid
+  delayQueue.io.in.bits := s0_oldFullAddrNoOff
+  scoreTable.io.req.valid := io.train.valid
+  scoreTable.io.req.bits := s0_oldFullAddr
+
+  /* s1 get or send req */
+  val s1_req_valid = RegInit(false.B)
+  val s1_needT = RegEnable(io.train.bits.needT, s0_fire)
+  val s1_source = RegEnable(io.train.bits.source, s0_fire)
+  val s1_newFullAddr = RegEnable(s0_newFullAddr, s0_fire)
+  val s1_reqVaddr = RegEnable(s0_reqVaddr, s0_fire)
+  // val out_req = Wire(new PrefetchReq)
+  // val out_req_valid = Wire(Bool())
+  // val out_drop_req = WireInit(false.B)
+
+  // pipeline control signal
+  when(s0_fire) {
+    if(virtualTrain) s1_req_valid := true.B
+    else s1_req_valid := !s0_crossPage // stop prefetch when prefetch req crosses pages
+  }.elsewhen(s1_fire){
+    s1_req_valid := false.B
+  }
+
+  if (virtualTrain) {
+    // FIXME lyq: it is not correct
+    s0_ready := io.tlb_req.req.ready && s1_ready || !s1_req_valid
+    s1_ready := io.req.ready || !io.req.valid
+    s1_fire := s1_ready && s1_req_valid
+  } else {
+    s0_ready := io.req.ready || !io.req.valid
+    s1_ready := io.req.ready
+    s1_fire := io.req.fire
+  }
+
+  // out value
+  io.train.ready := delayQueue.io.in.ready && scoreTable.io.req.ready && s0_ready
+  io.resp.ready := rrTable.io.w.ready
+  io.tlb_req.resp.ready := true.B
+
+  // different situation
+  val reqFilter = Module(new PrefetchReqBuffer)
+  when(prefetchDisable || !virtualTrain.B){
+    reqFilter.io.in_req.valid := false.B
+    reqFilter.io.in_req.bits := DontCare
+  }.otherwise{
+    reqFilter.io.in_req.valid := s1_req_valid
+    reqFilter.io.in_req.bits.full_vaddr := s1_newFullAddr
+    reqFilter.io.in_req.bits.base_vaddr := s1_reqVaddr
+    reqFilter.io.in_req.bits.needT := s1_needT
+    reqFilter.io.in_req.bits.source := s1_source
+    reqFilter.io.in_req.bits.isBOP := true.B
+  }
+
+  if(virtualTrain){
+    io.tlb_req <> reqFilter.io.tlb_req
+    io.req <> reqFilter.io.out_req
+  } else {
+    io.tlb_req.req.valid := false.B
+    io.tlb_req.req.bits := DontCare
+    io.tlb_req.req_kill := false.B
+
+    /* s1 send prefetch req */
+    io.req.valid := s1_req_valid
+    io.req.bits.tag := parseFullAddress(s1_newFullAddr)._1
+    io.req.bits.set := parseFullAddress(s1_newFullAddr)._2
+    io.req.bits.vaddr.foreach(_ := 0.U)
+    io.req.bits.needT := s1_needT
+    io.req.bits.source := s1_source
+    io.req.bits.pfSource := MemReqSource.Prefetch2L2BOP.id.U
+  }
+
+  for (off <- offsetList) {
+    if (off < 0) {
+      XSPerfAccumulate(cacheParams, "best_offset_neg_" + (-off).toString, prefetchOffset === off.S(offsetWidth.W).asUInt)
+    } else {
+      XSPerfAccumulate(cacheParams, "best_offset_pos_" + off.toString, prefetchOffset === off.U)
+    }
+  }
+  XSPerfAccumulate(cacheParams, "bop_req", io.req.fire)
+  XSPerfAccumulate(cacheParams, "bop_train", io.train.fire)
+  XSPerfAccumulate(cacheParams, "bop_resp", io.resp.fire)
+  XSPerfAccumulate(cacheParams, "bop_train_stall_for_st_not_ready", io.train.valid && !scoreTable.io.req.ready)
+  if(virtualTrain){
+    XSPerfAccumulate(cacheParams, "bop_train_stall_for_tlb_not_ready", io.train.valid && !io.tlb_req.req.ready)
+    // XSPerfAccumulate(cacheParams, "bop_req_drop", out_drop_req)
+  }else{
+    XSPerfAccumulate(cacheParams, "bop_cross_page", scoreTable.io.req.fire && s0_crossPage)
+  }
+  XSPerfAccumulate(cacheParams, "bop_drop_for_disable", scoreTable.io.req.fire && prefetchDisable)
+}

-class 
BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { +class PBestOffsetPrefetch(implicit p: Parameters) extends BOPModule { val io = IO(new Bundle() { val train = Flipped(DecoupledIO(new PrefetchTrain)) + val pbopCrossPage = Output(Bool()) val req = DecoupledIO(new PrefetchReq) val resp = Flipped(DecoupledIO(new PrefetchResp)) }) + val delayQueue = Module(new DelayQueue("pbop")) val rrTable = Module(new RecentRequestTable) - val scoreTable = Module(new OffsetScoreTable) + val scoreTable = Module(new OffsetScoreTable("pbop")) val prefetchOffset = scoreTable.io.prefetchOffset + val prefetchDisable = scoreTable.io.prefetchDisable val oldAddr = io.train.bits.addr + val oldAddrNoOff = oldAddr(oldAddr.getWidth-1, offsetBits) val newAddr = oldAddr + signedExtend((prefetchOffset << offsetBits), fullAddressBits) rrTable.io.r <> scoreTable.io.test - rrTable.io.w.valid := io.resp.valid - rrTable.io.w.bits := Cat(Cat(io.resp.bits.tag, io.resp.bits.set) - signedExtend(prefetchOffset, setBits + fullTagBits), 0.U(offsetBits.W)) + rrTable.io.w <> delayQueue.io.out + delayQueue.io.in.valid := io.train.valid + delayQueue.io.in.bits := oldAddrNoOff scoreTable.io.req.valid := io.train.valid scoreTable.io.req.bits := oldAddr @@ -286,13 +784,14 @@ class BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { req.set := parseFullAddress(newAddr)._2 req.needT := io.train.bits.needT req.source := io.train.bits.source - req_valid := !crossPage // stop prefetch when prefetch req crosses pages + req_valid := !crossPage && !prefetchDisable // stop prefetch when prefetch req crosses pages } + io.pbopCrossPage := crossPage io.req.valid := req_valid io.req.bits := req - io.req.bits.pfSource := MemReqSource.Prefetch2L2BOP.id.U - io.train.ready := scoreTable.io.req.ready && (!req_valid || io.req.ready) + io.req.bits.pfSource := MemReqSource.Prefetch2L2PBOP.id.U + io.train.ready := delayQueue.io.in.ready && scoreTable.io.req.ready && (!req_valid || io.req.ready) io.resp.ready := rrTable.io.w.ready for (off <- offsetList) { @@ -304,6 +803,8 @@ class BestOffsetPrefetch(implicit p: Parameters) extends BOPModule { } XSPerfAccumulate(cacheParams, "bop_req", io.req.fire) XSPerfAccumulate(cacheParams, "bop_train", io.train.fire) + XSPerfAccumulate(cacheParams, "bop_resp", io.resp.fire) XSPerfAccumulate(cacheParams, "bop_train_stall_for_st_not_ready", io.train.valid && !scoreTable.io.req.ready) - XSPerfAccumulate(cacheParams, "bop_cross_page", scoreTable.io.req.fire && crossPage) -} + XSPerfAccumulate(cacheParams, "bop_drop_for_cross_page", scoreTable.io.req.fire && crossPage) + XSPerfAccumulate(cacheParams, "bop_drop_for_disable", scoreTable.io.req.fire && prefetchDisable) +} \ No newline at end of file diff --git a/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala b/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala index 6490cb2ef..b1610ca20 100644 --- a/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala +++ b/src/main/scala/coupledL2/prefetch/PrefetchParameters.scala @@ -40,6 +40,7 @@ object PfSource extends Enumeration { val NoWhere = Value("NoWhere") val SMS = Value("SMS") val BOP = Value("BOP") + val PBOP = Value("PBOP") val Stream = Value("Stream") val Stride = Value("Stride") val TP = Value("TP") @@ -51,6 +52,7 @@ object PfSource extends Enumeration { val pfsrc = WireInit(NoWhere.id.U.asTypeOf(UInt(pfSourceBits.W))) switch(s) { is (MemReqSource.Prefetch2L2BOP.id.U) { pfsrc := BOP.id.U } + is (MemReqSource.Prefetch2L2PBOP.id.U) { pfsrc := PBOP.id.U } is 
(MemReqSource.Prefetch2L2SMS.id.U) { pfsrc := SMS.id.U } is (MemReqSource.Prefetch2L2TP.id.U) { pfsrc := TP.id.U } is (MemReqSource.Prefetch2L2Stream.id.U) { pfsrc := Stream.id.U } diff --git a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala index d0ff76c5c..cda0052c4 100644 --- a/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala +++ b/src/main/scala/coupledL2/prefetch/PrefetchReceiver.scala @@ -42,9 +42,14 @@ class PrefetchReceiver()(implicit p: Parameters) extends PrefetchModule { io.req.bits.tag := parseFullAddress(io.recv_addr.bits.addr)._1 io.req.bits.set := parseFullAddress(io.recv_addr.bits.addr)._2 + io.req.bits.vaddr.foreach(_ := 0.U) io.req.bits.needT := false.B io.req.bits.source := 0.U // TODO: ensure source 0 is dcache io.req.bits.pfSource := io.recv_addr.bits.pfSource io.req.valid := io.recv_addr.valid + io.tlb_req.req.valid := false.B + io.tlb_req.req.bits := DontCare + io.tlb_req.req_kill := DontCare + io.tlb_req.resp.ready := true.B } diff --git a/src/main/scala/coupledL2/prefetch/Prefetcher.scala b/src/main/scala/coupledL2/prefetch/Prefetcher.scala index 3b098929c..0d4c2819f 100644 --- a/src/main/scala/coupledL2/prefetch/Prefetcher.scala +++ b/src/main/scala/coupledL2/prefetch/Prefetcher.scala @@ -1,211 +1,417 @@ -/** ************************************************************************************* - * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences - * Copyright (c) 2020-2021 Peng Cheng Laboratory - * - * XiangShan is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * - * See the Mulan PSL v2 for more details. 
- * ************************************************************************************* - */ - -package coupledL2.prefetch - -import chisel3._ -import chisel3.util._ -import utility._ -import org.chipsalliance.cde.config.Parameters -import freechips.rocketchip.tilelink._ -import coupledL2._ -import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram} - -class PrefetchReq(implicit p: Parameters) extends PrefetchBundle { - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - val needT = Bool() - val source = UInt(sourceIdBits.W) - val pfSource = UInt(MemReqSource.reqSourceBits.W) - - def isBOP:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U - def isSMS:Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U - def isTP:Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U - def fromL2:Bool = - pfSource === MemReqSource.Prefetch2L2BOP.id.U || - pfSource === MemReqSource.Prefetch2L2SMS.id.U || - pfSource === MemReqSource.Prefetch2L2TP.id.U -} - -class PrefetchResp(implicit p: Parameters) extends PrefetchBundle { - // val id = UInt(sourceIdBits.W) - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - def addr = Cat(tag, set, 0.U(offsetBits.W)) -} - -class PrefetchTrain(implicit p: Parameters) extends PrefetchBundle { - val tag = UInt(fullTagBits.W) - val set = UInt(setBits.W) - val needT = Bool() - val source = UInt(sourceIdBits.W) - val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) - val hit = Bool() - val prefetched = Bool() - val pfsource = UInt(PfSource.pfSourceBits.W) - val reqsource = UInt(MemReqSource.reqSourceBits.W) - - def addr: UInt = Cat(tag, set, 0.U(offsetBits.W)) -} - -class PrefetchIO(implicit p: Parameters) extends PrefetchBundle { - val train = Flipped(DecoupledIO(new PrefetchTrain)) - val req = DecoupledIO(new PrefetchReq) - val resp = Flipped(DecoupledIO(new PrefetchResp)) - val recv_addr = Flipped(ValidIO(new Bundle() { - val addr = UInt(64.W) - val pfSource = UInt(MemReqSource.reqSourceBits.W) - })) -} - -class PrefetchQueue(implicit p: Parameters) extends PrefetchModule { - val io = IO(new Bundle { - val enq = Flipped(DecoupledIO(new PrefetchReq)) - val deq = DecoupledIO(new PrefetchReq) - }) - /* Here we implement a queue that - * 1. is pipelined 2. flows - * 3. 
always has the latest reqs, which means the queue is always ready for enq and deserting the eldest ones - */ - val queue = RegInit(VecInit(Seq.fill(inflightEntries)(0.U.asTypeOf(new PrefetchReq)))) - val valids = RegInit(VecInit(Seq.fill(inflightEntries)(false.B))) - val idxWidth = log2Up(inflightEntries) - val head = RegInit(0.U(idxWidth.W)) - val tail = RegInit(0.U(idxWidth.W)) - val empty = head === tail && !valids.last - val full = head === tail && valids.last - - when(!empty && io.deq.ready) { - valids(head) := false.B - head := head + 1.U - } - - when(io.enq.valid) { - queue(tail) := io.enq.bits - valids(tail) := !empty || !io.deq.ready // true.B - tail := tail + (!empty || !io.deq.ready).asUInt - when(full && !io.deq.ready) { - head := head + 1.U - } - } - - io.enq.ready := true.B - io.deq.valid := !empty || io.enq.valid - io.deq.bits := Mux(empty, io.enq.bits, queue(head)) - - // The reqs that are discarded = enq - deq - XSPerfAccumulate(cacheParams, "prefetch_queue_enq", io.enq.fire) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromBOP", io.enq.fire && io.enq.bits.isBOP) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromSMS", io.enq.fire && io.enq.bits.isSMS) - XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromTP", io.enq.fire && io.enq.bits.isTP) - - XSPerfAccumulate(cacheParams, "prefetch_queue_deq", io.deq.fire) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromBOP", io.deq.fire && io.deq.bits.isBOP) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromSMS", io.deq.fire && io.deq.bits.isSMS) - XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromTP", io.deq.fire && io.deq.bits.isTP) - - XSPerfHistogram(cacheParams, "prefetch_queue_entry", PopCount(valids.asUInt), - true.B, 0, inflightEntries, 1) -} - -class Prefetcher(implicit p: Parameters) extends PrefetchModule { - val io = IO(new PrefetchIO) - val tpio = IO(new Bundle() { - val tpmeta_port = prefetchOpt match { - case Some(param: PrefetchReceiverParams) => - if (param.hasTPPrefetcher) Some(new tpmetaPortIO()) else None - case _ => None - } - }) - val hartId = IO(Input(UInt(hartIdLen.W))) - - /* io_l2_pf_en: - * chicken bits for whether L2 prefetchers are enabled - * it will control BOP and TP prefetchers - */ - val io_l2_pf_en = IO(Input(Bool())) - - prefetchOpt.get match { - case bop: BOPParameters => - val pft = Module(new BestOffsetPrefetch) - val pftQueue = Module(new PrefetchQueue) - val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) - pft.io.train <> io.train - pft.io.resp <> io.resp - pftQueue.io.enq <> pft.io.req - pipe.io.in <> pftQueue.io.deq - io.req <> pipe.io.out - case receiver: PrefetchReceiverParams => - val pfRcv = Module(new PrefetchReceiver()) - val bop = Module(new BestOffsetPrefetch()(p.alterPartial({ - case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(BOPParameters())) - }))) - val tp = Module(new TemporalPrefetch()(p.alterPartial({ - case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters())) - }))) - val pftQueue = Module(new PrefetchQueue) - val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) - val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B)) - - // prefetch from upper level - pfRcv.io.recv_addr := ValidIODelay(io.recv_addr, 2) - pfRcv.io.train.valid := false.B - pfRcv.io.train.bits := 0.U.asTypeOf(new PrefetchTrain) - pfRcv.io.resp.valid := false.B - pfRcv.io.resp.bits := 0.U.asTypeOf(new PrefetchResp) - assert(!pfRcv.io.req.valid || - pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2SMS.id.U || - pfRcv.io.req.bits.pfSource === 
MemReqSource.Prefetch2L2Stream.id.U || - pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stride.id.U - ) - - // prefetch from local prefetchers: BOP & TP - bop.io.train <> io.train - bop.io.resp <> io.resp - tp.io.train <> io.train - tp.io.resp <> io.resp - tp.io.hartid := hartId - - // send to prq - pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) - pftQueue.io.enq.bits := Mux(pfRcv.io.req.valid, - pfRcv.io.req.bits, - Mux(bop.io.req.valid, - bop.io.req.bits, - tp.io.req.bits - ) - ) - pfRcv.io.req.ready := true.B - bop.io.req.ready := true.B - tp.io.req.ready := !pfRcv.io.req.valid && !bop.io.req.valid - pipe.io.in <> pftQueue.io.deq - io.req <> pipe.io.out - - // tpmeta interface - tpio.tpmeta_port match { - case Some(port) => tp.io.tpmeta_port <> port - case None => tp.io.tpmeta_port <> DontCare - } - - XSPerfAccumulate(cacheParams, "prefetch_req_fromSMS", pfRcv.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && bop.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) - XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", - pfRcv.io.req.valid && l2_pf_en && (bop.io.req.valid || tp.io.req.valid)) - case _ => assert(cond = false, "Unknown prefetcher") - } -} +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. 
+ * ************************************************************************************* + */ + +package coupledL2.prefetch + +import chisel3._ +import chisel3.util._ +import utility._ +import org.chipsalliance.cde.config.Parameters +import freechips.rocketchip.tilelink._ +import coupledL2._ +import coupledL2.utils.{XSPerfAccumulate, XSPerfHistogram} + +/* virtual address */ +trait HasPrefetcherHelper extends HasCircularQueuePtrHelper with HasCoupledL2Parameters { + // filter + val TRAIN_FILTER_SIZE = 4 + val REQ_FILTER_SIZE = 16 + val TLB_REPLAY_CNT = 10 + + // parameters + val BLK_ADDR_RAW_WIDTH = 10 + val REGION_SIZE = 1024 + val PAGE_OFFSET = pageOffsetBits + val VADDR_HASH_WIDTH = 5 + + // vaddr: + // | tag | index | offset | + // | block addr | block offset | + // | region addr | region offset | + val BLOCK_OFFSET = offsetBits + val REGION_OFFSET = log2Up(REGION_SIZE) + val REGION_BLKS = REGION_SIZE / blockBytes + val INDEX_BITS = log2Up(REGION_BLKS) + val TAG_BITS = fullVAddrBits - REGION_OFFSET + val PTAG_BITS = fullAddressBits - REGION_OFFSET + val BLOCK_ADDR_BITS = fullVAddrBits - BLOCK_OFFSET + + // hash related + val HASH_TAG_WIDTH = VADDR_HASH_WIDTH + BLK_ADDR_RAW_WIDTH + + def get_tag(vaddr: UInt) = { + require(vaddr.getWidth == fullVAddrBits) + vaddr(vaddr.getWidth - 1, REGION_OFFSET) + } + + def get_ptag(vaddr: UInt) = { + require(vaddr.getWidth == fullAddressBits) + vaddr(vaddr.getWidth - 1, REGION_OFFSET) + } + + def get_index(addr: UInt) = { + require(addr.getWidth >= REGION_OFFSET) + addr(REGION_OFFSET - 1, BLOCK_OFFSET) + } + + def get_index_oh(vaddr: UInt): UInt = { + UIntToOH(get_index(vaddr)) + } + + def get_block_vaddr(vaddr: UInt): UInt = { + vaddr(vaddr.getWidth - 1, BLOCK_OFFSET) + } + + def _vaddr_hash(x: UInt): UInt = { + val width = VADDR_HASH_WIDTH + val low = x(width - 1, 0) + val mid = x(2 * width - 1, width) + val high = x(3 * width - 1, 2 * width) + low ^ mid ^ high + } + + def block_hash_tag(vaddr: UInt): UInt = { + val blk_addr = get_block_vaddr(vaddr) + val low = blk_addr(BLK_ADDR_RAW_WIDTH - 1, 0) + val high = blk_addr(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH) + val high_hash = _vaddr_hash(high) + Cat(high_hash, low) + } + + def region_hash_tag(vaddr: UInt): UInt = { + val region_tag = get_tag(vaddr) + val low = region_tag(BLK_ADDR_RAW_WIDTH - 1, 0) + val high = region_tag(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH) + val high_hash = _vaddr_hash(high) + Cat(high_hash, low) + } + + def region_to_block_addr(tag: UInt, index: UInt): UInt = { + Cat(tag, index) + } + + def toBinary(n: Int): String = n match { + case 0 | 1 => s"$n" + case _ => s"${toBinary(n / 2)}${n % 2}" + } +} + +class PrefetchReq(implicit p: Parameters) extends PrefetchBundle { + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val needT = Bool() + val source = UInt(sourceIdBits.W) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + + def isBOP:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U + def isPBOP:Bool = pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def isSMS:Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U + def isTP:Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U + def needAck:Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U || pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def fromL2:Bool = + pfSource === MemReqSource.Prefetch2L2BOP.id.U || + pfSource === MemReqSource.Prefetch2L2PBOP.id.U || + pfSource === 
MemReqSource.Prefetch2L2SMS.id.U || + pfSource === MemReqSource.Prefetch2L2TP.id.U +} + +class PrefetchResp(implicit p: Parameters) extends PrefetchBundle { + // val id = UInt(sourceIdBits.W) + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + + def addr = Cat(tag, set, 0.U(offsetBits.W)) + def isBOP: Bool = pfSource === MemReqSource.Prefetch2L2BOP.id.U + def isPBOP: Bool = pfSource === MemReqSource.Prefetch2L2PBOP.id.U + def isSMS: Bool = pfSource === MemReqSource.Prefetch2L2SMS.id.U + def isTP: Bool = pfSource === MemReqSource.Prefetch2L2TP.id.U + def fromL2: Bool = + pfSource === MemReqSource.Prefetch2L2BOP.id.U || + pfSource === MemReqSource.Prefetch2L2PBOP.id.U || + pfSource === MemReqSource.Prefetch2L2SMS.id.U || + pfSource === MemReqSource.Prefetch2L2TP.id.U +} + +class PrefetchTrain(implicit p: Parameters) extends PrefetchBundle { + val tag = UInt(fullTagBits.W) + val set = UInt(setBits.W) + val needT = Bool() + val source = UInt(sourceIdBits.W) + val vaddr = vaddrBitsOpt.map(_ => UInt(vaddrBitsOpt.get.W)) + val hit = Bool() + val prefetched = Bool() + val pfsource = UInt(PfSource.pfSourceBits.W) + val reqsource = UInt(MemReqSource.reqSourceBits.W) + + def addr: UInt = Cat(tag, set, 0.U(offsetBits.W)) +} + +class PrefetchIO(implicit p: Parameters) extends PrefetchBundle { + val train = Flipped(DecoupledIO(new PrefetchTrain)) + val tlb_req = new L2ToL1TlbIO(nRespDups= 1) + val req = DecoupledIO(new PrefetchReq) + val resp = Flipped(DecoupledIO(new PrefetchResp)) + val recv_addr = Flipped(ValidIO(new Bundle() { + val addr = UInt(64.W) + val pfSource = UInt(MemReqSource.reqSourceBits.W) + })) +} + +class PrefetchQueue(implicit p: Parameters) extends PrefetchModule { + val io = IO(new Bundle { + val enq = Flipped(DecoupledIO(new PrefetchReq)) + val deq = DecoupledIO(new PrefetchReq) + }) + /* Here we implement a queue that + * 1. is pipelined 2. flows + * 3. 
always has the latest reqs, which means the queue is always ready for enq and discarding the eldest ones + */ + val queue = RegInit(VecInit(Seq.fill(inflightEntries)(0.U.asTypeOf(new PrefetchReq)))) + val valids = RegInit(VecInit(Seq.fill(inflightEntries)(false.B))) + val idxWidth = log2Up(inflightEntries) + val head = RegInit(0.U(idxWidth.W)) + val tail = RegInit(0.U(idxWidth.W)) + val empty = head === tail && !valids.last + val full = head === tail && valids.last + + when(!empty && io.deq.ready) { + valids(head) := false.B + head := head + 1.U + } + + when(io.enq.valid) { + queue(tail) := io.enq.bits + valids(tail) := !empty || !io.deq.ready // true.B + tail := tail + (!empty || !io.deq.ready).asUInt + when(full && !io.deq.ready) { + head := head + 1.U + } + } + + io.enq.ready := true.B + io.deq.valid := !empty || io.enq.valid + io.deq.bits := Mux(empty, io.enq.bits, queue(head)) + + // The reqs that are discarded = enq - deq + XSPerfAccumulate(cacheParams, "prefetch_queue_enq", io.enq.fire) + XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromBOP", io.enq.fire && io.enq.bits.isBOP) + XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromPBOP", io.enq.fire && io.enq.bits.isPBOP) + XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromSMS", io.enq.fire && io.enq.bits.isSMS) + XSPerfAccumulate(cacheParams, "prefetch_queue_enq_fromTP", io.enq.fire && io.enq.bits.isTP) + + XSPerfAccumulate(cacheParams, "prefetch_queue_deq", io.deq.fire) + XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromBOP", io.deq.fire && io.deq.bits.isBOP) + XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromPBOP", io.deq.fire && io.deq.bits.isPBOP) + XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromSMS", io.deq.fire && io.deq.bits.isSMS) + XSPerfAccumulate(cacheParams, "prefetch_queue_deq_fromTP", io.deq.fire && io.deq.bits.isTP) + + XSPerfHistogram(cacheParams, "prefetch_queue_entry", PopCount(valids.asUInt), + true.B, 0, inflightEntries, 1) + XSPerfAccumulate(cacheParams, "prefetch_queue_empty", empty) + XSPerfAccumulate(cacheParams, "prefetch_queue_full", full) +} + +class Prefetcher(implicit p: Parameters) extends PrefetchModule { + val io = IO(new PrefetchIO) + val tpio = IO(new Bundle() { + val tpmeta_port = prefetchOpt match { + case Some(param: PrefetchReceiverParams) => + if (param.hasTPPrefetcher) Some(new tpmetaPortIO()) else None + case _ => None + } + }) + val hartId = IO(Input(UInt(hartIdLen.W))) + + /* io_l2_pf_en: + * chicken bits for whether L2 prefetchers are enabled + * it will control BOP (vbop & pbop) and TP prefetchers + */ + val io_l2_pf_en = IO(Input(Bool())) + + prefetchOpt.get match { + case bop: BOPParameters => + val pft = Module(new VBestOffsetPrefetch) + val pftQueue = Module(new PrefetchQueue) + val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) + pft.io.train <> io.train + pft.io.resp <> io.resp + pft.io.tlb_req <> io.tlb_req + pftQueue.io.enq <> pft.io.req + pipe.io.in <> pftQueue.io.deq + io.req <> pipe.io.out + case receiver: PrefetchReceiverParams => + val pfRcv = Module(new PrefetchReceiver()) + val pbop = Module(new PBestOffsetPrefetch()(p.alterPartial({ + case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(BOPParameters( + virtualTrain = false, + badScore = 1, + offsetList = Seq( + -32, -30, -27, -25, -24, -20, -18, -16, -15, + -12, -10, -9, -8, -6, -5, -4, -3, -2, -1, + 1, 2, 3, 4, 5, 6, 8, 9, 10, + 12, 15, 16, 18, 20, 24, 25, 27, 30 + )))) + }))) + val vbop = Module(new VBestOffsetPrefetch()(p.alterPartial({ + case L2ParamKey => p(L2ParamKey).copy(prefetch = 
Some(BOPParameters( + badScore = 2, + offsetList = Seq( + -117,-147,-91,117,147,91, + -256, -250, -243, -240, -225, -216, -200, + -192, -180, -162, -160, -150, -144, -135, -128, + -125, -120, -108, -100, -96, -90, -81, -80, + -75, -72, -64, -60, -54, -50, -48, -45, + -40, -36, -32, -30, -27, -25, -24, -20, + -18, -16, -15, -12, -10, -9, -8, -6, + -5, -4, -3, -2, -1, + 1, 2, 3, 4, 5, 6, 8, + 9, 10, 12, 15, 16, 18, 20, 24, + 25, 27, 30, 32, 36, 40, 45, 48, + 50, 54, 60, 64, 72, 75, 80, 81, + 90, 96, 100, 108, 120, 125, 128, 135, + 144, 150, 160, 162, 180, 192, 200, 216, + 225, 240, 243, 250/*, 256*/ + ) + ))) + }))) + val tp = Module(new TemporalPrefetch()(p.alterPartial({ + case L2ParamKey => p(L2ParamKey).copy(prefetch = Some(TPParameters())) + }))) + val pftQueue = Module(new PrefetchQueue) + val pipe = Module(new Pipeline(io.req.bits.cloneType, 1)) + val l2_pf_en = RegNextN(io_l2_pf_en, 2, Some(true.B)) + + // prefetch from upper level + pfRcv.io.recv_addr := ValidIODelay(io.recv_addr, 2) + pfRcv.io.train.valid := false.B + pfRcv.io.train.bits := 0.U.asTypeOf(new PrefetchTrain) + pfRcv.io.resp.valid := false.B + pfRcv.io.resp.bits := 0.U.asTypeOf(new PrefetchResp) + pfRcv.io.tlb_req.req.ready := true.B + pfRcv.io.tlb_req.resp.valid := false.B + pfRcv.io.tlb_req.resp.bits := DontCare + assert(!pfRcv.io.req.valid || + pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2SMS.id.U || + pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stream.id.U || + pfRcv.io.req.bits.pfSource === MemReqSource.Prefetch2L2Stride.id.U + ) + + // prefetch from local prefetchers: BOP (vbop & pbop) & TP + vbop.io.train <> io.train + vbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U) + vbop.io.resp <> io.resp + vbop.io.resp.valid := io.resp.valid && io.resp.bits.isBOP + vbop.io.tlb_req <> io.tlb_req + vbop.io.pbopCrossPage := true.B // pbop.io.pbopCrossPage // let vbop have nothing to do with pbop + + pbop.io.train <> io.train + pbop.io.train.valid := io.train.valid && (io.train.bits.reqsource =/= MemReqSource.L1DataPrefetch.id.U) + pbop.io.resp <> io.resp + pbop.io.resp.valid := io.resp.valid && io.resp.bits.isPBOP + tp.io.train <> io.train + tp.io.resp <> io.resp + tp.io.hartid := hartId + + pfRcv.io.req.ready := true.B + vbop.io.req.ready := true.B + pbop.io.req.ready := true.B + tp.io.req.ready := !pfRcv.io.req.valid && !vbop.io.req.valid + pipe.io.in <> pftQueue.io.deq + io.req <> pipe.io.out + + // tpmeta interface + tp.io.tpmeta_port <> tpio.tpmeta_port.get + + /* pri vbop */ + pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid)) + pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + pfRcv.io.req.valid -> pfRcv.io.req.bits, + vbop.io.req.valid -> vbop.io.req.bits, + pbop.io.req.valid -> pbop.io.req.bits, + tp.io.req.valid -> tp.io.req.bits + )) + XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && 
pbop.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* pri pbop */ + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || pbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // pbop.io.req.valid -> pbop.io.req.bits, + // vbop.io.req.valid -> vbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* solo vbop */ + // vbop.io.pbopCrossPage := true.B + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // vbop.io.req.valid -> vbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromBOP", l2_pf_en && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectBOP", l2_pf_en && !pfRcv.io.req.valid && vbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !vbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + /* solo pbop */ + // vbop.io.train.valid := false.B + // vbop.io.resp.valid := false.B + // pftQueue.io.enq.valid := pfRcv.io.req.valid || (l2_pf_en && (pbop.io.req.valid || tp.io.req.valid)) + // pftQueue.io.enq.bits := ParallelPriorityMux(Seq( + // pfRcv.io.req.valid -> pfRcv.io.req.bits, + // pbop.io.req.valid -> pbop.io.req.bits, + // tp.io.req.valid -> tp.io.req.bits + // )) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromL1", l2_pf_en && pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromPBOP", l2_pf_en && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_fromTP", l2_pf_en && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectL1", l2_pf_en && 
pfRcv.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectPBOP", l2_pf_en && !pfRcv.io.req.valid && pbop.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_selectTP", l2_pf_en && !pfRcv.io.req.valid && !pbop.io.req.valid && tp.io.req.valid) + // XSPerfAccumulate(cacheParams, "prefetch_req_SMS_other_overlapped", + // pfRcv.io.req.valid && l2_pf_en && (vbop.io.req.valid || tp.io.req.valid)) + + case _ => assert(cond = false, "Unknown prefetcher") + } +} diff --git a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala index b0f3d181c..ff8c5e4fb 100644 --- a/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala +++ b/src/main/scala/coupledL2/prefetch/TemporalPrefetch.scala @@ -344,6 +344,7 @@ class TemporalPrefetch(implicit p: Parameters) extends TPModule { io.req.valid := Mux(enableTP.orR, sending_valid, false.B) io.req.bits.tag := sendingTag io.req.bits.set := sendingSet + io.req.bits.vaddr.foreach(_ := 0.U) io.req.bits.needT := true.B io.req.bits.source := 0.U // TODO: ensure source 0 is dcache io.req.bits.pfSource := MemReqSource.Prefetch2L2TP.id.U diff --git a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala index 96dbbc915..21d441c27 100644 --- a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala +++ b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala @@ -19,7 +19,7 @@ package coupledL2.tl2chi import chisel3._ import chisel3.util._ -import utility.{FastArbiter, Pipeline} +import utility.{FastArbiter, Pipeline, ParallelPriorityMux} import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ @@ -120,11 +120,18 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { class CoupledL2Imp(wrapper: LazyModule) extends LazyModuleImp(wrapper) { val banks = node.in.size val bankBits = log2Ceil(banks) + val l2TlbParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } val io = IO(new Bundle { val hartId = Input(UInt(hartIdLen.W)) val l2_hint = ValidIO(new L2ToL1Hint()) + val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams) val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W)))) val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool())) } @@ -132,9 +139,6 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { val nodeID = Input(UInt()) }) - // TODO - io.debugTopDown <> DontCare - // Display info val sizeBytes = cacheParams.toCacheParams.capacity.toDouble val sizeStr = sizeBytesToStr(sizeBytes) @@ -178,12 +182,14 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams))))) val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchResp()(pftParams))))) val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) + io.l2_tlb_req <> DontCare // TODO: l2_tlb_req should be Option prefetchOpt.foreach { _ => fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR prefetcher.get.hartId := io.hartId fastArb(prefetchResps.get, prefetcher.get.io.resp, Some("prefetch_resp")) + prefetcher.get.io.tlb_req <> io.l2_tlb_req } 
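The receiver case above funnels four request sources into one PrefetchQueue with a fixed priority (pfRcv first, then vbop, pbop, tp), selected combinationally by ParallelPriorityMux. Below is a minimal, self-contained sketch of that selection idiom in plain Chisel; the ParallelPriorityMux object is a stand-in for the helper of the same name in the utility library, and the four-source module is illustrative rather than the real Prefetcher.

import chisel3._
import chisel3.util._

object ParallelPriorityMux {
  // stand-in for utility.ParallelPriorityMux: first asserted select wins
  def apply[T <: Data](in: Seq[(Bool, T)]): T =
    PriorityMux(in.map(_._1), in.map(_._2))
}

class PrefetchSelect extends Module {
  val io = IO(new Bundle {
    val srcs = Vec(4, Flipped(Decoupled(UInt(36.W)))) // index 0 = highest priority
    val out  = Decoupled(UInt(36.W))
  })
  io.out.valid := io.srcs.map(_.valid).reduce(_ || _)
  io.out.bits  := ParallelPriorityMux(io.srcs.map(s => (s.valid, s.bits)))
  // as with pfRcv/vbop/pbop above, losing sources are not back-pressured:
  // their requests are simply dropped for the cycle
  io.srcs.foreach(_.ready := true.B)
}

Dropping the losers is acceptable here because prefetch requests are hints; only tp is gated by the others' valids in the real code, so a long-latency TP request cannot starve the faster BOP paths.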
pf_recv_node match { case Some(x) => @@ -211,9 +217,19 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { RegNextN(data, n - 1) } - val hint_chosen = Wire(UInt(node.in.size.W)) - val hint_fire = Wire(Bool()) - val release_sourceD_condition = Wire(Vec(node.in.size, Bool())) + // ** WARNING:TODO: this depends on where the latch is + // ** if Hint latched in slice, while D-Channel latched in XSTile + // ** we need only [hintCycleAhead - 1] later + val sliceAhead = hintCycleAhead - 1 + + val hintChosen = Wire(UInt(banks.W)) + val hintFire = Wire(Bool()) + + // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice + // then we releaseSourceD, enabling io.d.ready for other slices + // TODO: if Hint for single slice is 100% accurate, may consider remove this + val releaseSourceD = Wire(Vec(banks, Bool())) + val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR val slices = node.in.zipWithIndex.map { case ((in, edgeIn), i) => @@ -225,18 +241,21 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { case SliceIdKey => i })) } - val sourceD_can_go = RegNextN(!hint_fire || i.U === OHToUInt(hint_chosen), hintCycleAhead - 1) - release_sourceD_condition(i) := sourceD_can_go && !slice.io.in.d.valid slice.io.in <> in - if(enableHintGuidedGrant) { - // If the hint of slice X is selected in T cycle, then in T + 3 cycle we will try our best to select the grant of slice X. - // If slice X has no grant in T + 3 cycle, it means that the hint of T cycle is wrong, so relax the restriction on grant selection. - // Timing will be worse if enabled - in.d.valid := slice.io.in.d.valid && (sourceD_can_go || Cat(release_sourceD_condition).orR) - slice.io.in.d.ready := in.d.ready && (sourceD_can_go || Cat(release_sourceD_condition).orR) + if (enableHintGuidedGrant) { + // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) + // we will try our best to select the grant of slice X. + // If slice X has no grant then, it means that the hint at cycle T is wrong, + // so we relax the restriction on grant selection. 
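The rewritten comment above is the heart of hint-guided grant selection: a hint that fires at cycle T nominates one slice to present GrantData at T + sliceAhead or T + sliceAhead + 1, and only when the nominated slice turns out to have no D-channel beat is the gate opened for every slice. A toy model of that window, assuming ShiftRegister as a substitute for the utility library's RegNextN and treating each slice's D-channel valid as a plain input (the same logic is duplicated in TL2TLCoupledL2 further below, so one sketch covers both):

import chisel3._
import chisel3.util._

class HintWindow(nSlices: Int, sliceAhead: Int) extends Module {
  require(nSlices > 1 && sliceAhead >= 1)
  val io = IO(new Bundle {
    val hintFire   = Input(Bool())
    val hintChosen = Input(UInt(log2Ceil(nSlices).W)) // binary index, not one-hot
    val dValid     = Input(Vec(nSlices, Bool()))      // D-channel valid per slice
    val canFire    = Output(Vec(nSlices, Bool()))
  })
  val releaseSourceD = Wire(Vec(nSlices, Bool()))
  // no hint fired in either lookahead window, or a nominated slice came up
  // empty: open the gate for everyone
  val allCanFire = (ShiftRegister(!io.hintFire, sliceAhead) &&
                    ShiftRegister(!io.hintFire, sliceAhead + 1)) ||
                   releaseSourceD.asUInt.orR
  for (i <- 0 until nSlices) {
    // this slice was nominated sliceAhead or sliceAhead + 1 cycles ago
    val sliceCanFire = ShiftRegister(io.hintFire && i.U === io.hintChosen, sliceAhead) ||
                       ShiftRegister(io.hintFire && i.U === io.hintChosen, sliceAhead + 1)
    releaseSourceD(i) := sliceCanFire && !io.dValid(i) // the nomination missed
    io.canFire(i) := sliceCanFire || allCanFire
  }
}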
+ val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) || + RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1) + + releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid + + in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire) + slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire) } in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i) - slice.io.out <> DontCare slice.io.sliceId := i.U slice.io.prefetch.zip(prefetcher).foreach { @@ -263,28 +282,39 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { prefetchResps.get(i).bits.tag := resp_tag prefetchResps.get(i).bits.set := resp_set } + s.tlb_req.req.valid := false.B + s.tlb_req.req.bits := DontCare + s.tlb_req.req_kill := DontCare + s.tlb_req.resp.ready := true.B } slice } - val l1Hint_arb = Module(new Arbiter(new L2ToL1Hint, slices.size)) - val slices_l1Hint = slices.zipWithIndex.map { - case (s, i) => Pipeline(s.io.l1Hint, depth = 1, pipe = false, name = Some(s"l1Hint_buffer_$i")) + + if (enableHintGuidedGrant) { + // for timing consideration, hint should latch one cycle before sending to L1 + // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false" + val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size)) + val slices_l1Hint = slices.zipWithIndex.map { + case (s, i) => s.io.l1Hint + } + // should only Hint for DCache + val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients + .filter(_.supports.probe) + .map(c => { + (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) + }).head + + l1HintArb.io.in <> VecInit(slices_l1Hint) + io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache + io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart + io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword + // continuous hints can only be sent every two cycle, since GrantData takes two cycles + l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B) + + hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT ! 
+ hintFire := io.l2_hint.valid } - val (client_sourceId_match_oh, client_sourceId_start) = node.in.head._2.client.clients - .map(c => { - (c.sourceId.contains(l1Hint_arb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) - }) - .unzip - l1Hint_arb.io.in <> VecInit(slices_l1Hint) - io.l2_hint.valid := l1Hint_arb.io.out.fire - io.l2_hint.bits.sourceId := l1Hint_arb.io.out.bits.sourceId - Mux1H(client_sourceId_match_oh, client_sourceId_start) - io.l2_hint.bits.isKeyword := l1Hint_arb.io.out.bits.isKeyword - // always ready for grant hint - l1Hint_arb.io.out.ready := true.B - - hint_chosen := l1Hint_arb.io.chosen - hint_fire := io.l2_hint.valid /** * TxnID space arrangement: @@ -414,32 +444,55 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { io.chi <> linkMonitor.io.out linkMonitor.io.nodeID := io.nodeID + // ==================== TopDown ==================== val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { case EdgeInKey => node.in.head._2 case BankBitsKey => bankBits }))) - // topDown match { - // case Some(t) => - // t.io.msStatus.zip(slices).foreach { - // case (in, s) => in := s.io.msStatus.get - // } - // t.io.dirResult.zip(slices).foreach { - // case (res, s) => res := s.io.dirResult.get - // } - // t.io.latePF.zip(slices).foreach { - // case (in, s) => in := s.io.latePF.get - // } - // t.io.debugTopDown <> io.debugTopDown - // case None => io.debugTopDown.l2MissMatch.foreach(_ := false.B) - // } - topDown.foreach(_.io <> DontCare) // TODO - - XSPerfAccumulate(cacheParams, "hint_fire", io.l2_hint.valid) - val grant_fire = slices.map{ slice => { - val (_, _, grant_fire_last, _) = node.in.head._2.count(slice.io.in.d) - slice.io.in.d.fire && grant_fire_last && slice.io.in.d.bits.opcode === GrantData - }} - XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_fire))) + topDown match { + case Some(t) => + t.io.msStatus.zip(slices).foreach { + case (in, s) => in := s.io.msStatus.get + } + t.io.dirResult.zip(slices).foreach { + case (res, s) => res := s.io.dirResult.get + } + t.io.latePF.zip(slices).foreach { + case (in, s) => in := s.io.latePF.get + } + t.io.debugTopDown <> io.debugTopDown + case None => io.debugTopDown.l2MissMatch.foreach(_ := false.B) + } + + // ==================== XSPerf Counters ==================== + val grant_data_fire = slices.map { slice => { + val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) + slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData + } + } + XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) + + val hint_source = io.l2_hint.bits.sourceId + + val grant_data_source = ParallelPriorityMux(slices.map { + s => (s.io.in.d.fire, s.io.in.d.bits.source) + }) + + val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) + hintPipe2.io.in.valid := io.l2_hint.valid + hintPipe2.io.in.bits := hint_source + hintPipe2.io.out.ready := true.B + + val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) + hintPipe1.io.in.valid := io.l2_hint.valid + hintPipe1.io.in.bits := hint_source + hintPipe1.io.out.ready := true.B + + val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) + + val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "ok2Hints", okHint) } lazy val module = new CoupledL2Imp(this) diff --git 
a/src/main/scala/coupledL2/tl2tl/MSHR.scala b/src/main/scala/coupledL2/tl2tl/MSHR.scala index 8e42a5b5e..d503ba62f 100644 --- a/src/main/scala/coupledL2/tl2tl/MSHR.scala +++ b/src/main/scala/coupledL2/tl2tl/MSHR.scala @@ -166,7 +166,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_release.set := req.set mp_release.off := 0.U mp_release.alias.foreach(_ := 0.U) - mp_release.vaddr.foreach(_ := 0.U) + mp_release.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_release.isKeyword.foreach(_ := false.B) // if dirty, we must ReleaseData // if accessed, we ReleaseData to keep the data in L3, for future access to be faster @@ -214,7 +214,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_probeack.set := req.set mp_probeack.off := req.off mp_probeack.alias.foreach(_ := 0.U) - mp_probeack.vaddr.foreach(_ := 0.U) + mp_probeack.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_probeack.isKeyword.foreach(_ := false.B) mp_probeack.opcode := Mux( meta.dirty && isT(meta.state) || probeDirty || req.needProbeAckData, @@ -284,7 +284,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mp_grant.off := req.off mp_grant.sourceId := req.sourceId mp_grant.alias.foreach(_ := 0.U) - mp_grant.vaddr.foreach(_ := 0.U) + mp_grant.vaddr.foreach(_ := req.vaddr.getOrElse(0.U)) mp_grant.isKeyword.foreach(_ := req.isKeyword.getOrElse(false.B)) mp_grant.opcode := odOpGen(req.opcode) mp_grant.param := Mux( diff --git a/src/main/scala/coupledL2/tl2tl/Slice.scala b/src/main/scala/coupledL2/tl2tl/Slice.scala index 70ea2c848..019852c00 100644 --- a/src/main/scala/coupledL2/tl2tl/Slice.scala +++ b/src/main/scala/coupledL2/tl2tl/Slice.scala @@ -143,6 +143,9 @@ class Slice()(implicit p: Parameters) extends L2Module { p.train <> mainPipe.io.prefetchTrain.get sinkA.io.prefetchReq.get <> p.req p.resp <> grantBuf.io.prefetchResp.get + p.tlb_req.req.ready := true.B + p.tlb_req.resp.valid := false.B + p.tlb_req.resp.bits := DontCare p.recv_addr := 0.U.asTypeOf(p.recv_addr) } diff --git a/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala index 91f60abe4..f3eb6b281 100644 --- a/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala +++ b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala @@ -19,7 +19,7 @@ package coupledL2.tl2tl import chisel3._ import chisel3.util._ -import utility.{FastArbiter, Pipeline} +import utility.{FastArbiter, Pipeline, ParallelPriorityMux} import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.tilelink.TLMessages._ @@ -94,11 +94,18 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { val banks = node.in.size val bankBits = if (banks == 1) 0 else log2Up(banks) + val l2TlbParams: Parameters = p.alterPartial { + case EdgeInKey => node.in.head._2 + case EdgeOutKey => node.out.head._2 + case BankBitsKey => bankBits + } val io = IO(new Bundle { val hartId = Input(UInt(hartIdLen.W)) // val l2_hint = Valid(UInt(32.W)) val l2_hint = ValidIO(new L2ToL1Hint()) + val l2_tlb_req = new L2ToL1TlbIO(nRespDups = 1)(l2TlbParams) val debugTopDown = new Bundle { + val robTrueCommit = Input(UInt(64.W)) val robHeadPaddr = Vec(cacheParams.hartIds.length, Flipped(Valid(UInt(36.W)))) val l2MissMatch = Vec(cacheParams.hartIds.length, Output(Bool())) } @@ -140,6 +147,7 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { prefetcher.get.io.req.ready := Cat(prefetchReqsReady).orR prefetcher.get.hartId := io.hartId fastArb(prefetchResps.get, prefetcher.get.io.resp, 
Some("prefetch_resp")) + prefetcher.get.io.tlb_req <> io.l2_tlb_req } pf_recv_node match { case Some(x) => @@ -177,9 +185,19 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { RegNextN(data, n - 1) } - val hint_chosen = Wire(UInt(node.in.size.W)) - val hint_fire = Wire(Bool()) - val release_sourceD_condition = Wire(Vec(node.in.size, Bool())) + // ** WARNING:TODO: this depends on where the latch is + // ** if Hint latched in slice, while D-Channel latched in XSTile + // ** we need only [hintCycleAhead - 1] later + val sliceAhead = hintCycleAhead - 1 + + val hintChosen = Wire(UInt(banks.W)) + val hintFire = Wire(Bool()) + + // if Hint indicates that this slice should fireD, yet no D resp comes out of this slice + // then we releaseSourceD, enabling io.d.ready for other slices + // TODO: if Hint for single slice is 100% accurate, may consider remove this + val releaseSourceD = Wire(Vec(node.in.size, Bool())) + val allCanFire = (RegNextN(!hintFire, sliceAhead) && RegNextN(!hintFire, sliceAhead + 1)) || Cat(releaseSourceD).orR val slices = node.in.zip(node.out).zipWithIndex.map { case (((in, edgeIn), (out, edgeOut)), i) => @@ -193,15 +211,19 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { case SliceIdKey => i })) } - val sourceD_can_go = RegNextN(!hint_fire || i.U === OHToUInt(hint_chosen), hintCycleAhead - 1) - release_sourceD_condition(i) := sourceD_can_go && !slice.io.in.d.valid slice.io.in <> in if(enableHintGuidedGrant) { - // If the hint of slice X is selected in T cycle, then in T + 3 cycle we will try our best to select the grant of slice X. - // If slice X has no grant in T + 3 cycle, it means that the hint of T cycle is wrong, so relax the restriction on grant selection. - // Timing will be worse if enabled - in.d.valid := slice.io.in.d.valid && (sourceD_can_go || Cat(release_sourceD_condition).orR) - slice.io.in.d.ready := in.d.ready && (sourceD_can_go || Cat(release_sourceD_condition).orR) + // If the hint of slice X is selected at cycle T, then at cycle (T + 3) & (T + 4) + // we will try our best to select the grant of slice X. + // If slice X has no grant then, it means that the hint at cycle T is wrong, + // so we relax the restriction on grant selection. 
+ val sliceCanFire = RegNextN(hintFire && i.U === hintChosen, sliceAhead) || + RegNextN(hintFire && i.U === hintChosen, sliceAhead + 1) + + releaseSourceD(i) := sliceCanFire && !slice.io.in.d.valid + + in.d.valid := slice.io.in.d.valid && (sliceCanFire || allCanFire) + slice.io.in.d.ready := in.d.ready && (sliceCanFire || allCanFire) } in.b.bits.address := restoreAddress(slice.io.in.b.bits.address, i) out <> slice.io.out @@ -233,29 +255,41 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { prefetchResps.get(i).bits.tag := resp_tag prefetchResps.get(i).bits.set := resp_set } + s.tlb_req.req.valid := false.B + s.tlb_req.req.bits := DontCare + s.tlb_req.req_kill := DontCare + s.tlb_req.resp.ready := true.B } slice } - val l1Hint_arb = Module(new Arbiter(new L2ToL1Hint, slices.size)) - val slices_l1Hint = slices.zipWithIndex.map { - case (s, i) => Pipeline(s.io.l1Hint, depth = 1, pipe = false, name = Some(s"l1Hint_buffer_$i")) + + if (enableHintGuidedGrant) { + // for timing consideration, hint should latch one cycle before sending to L1 + // instead of adding a Pipeline/Queue to latch here, we just set hintQueue in GrantBuf & CustomL1Hint "flow=false" + val l1HintArb = Module(new Arbiter(new L2ToL1Hint(), slices.size)) + val slices_l1Hint = slices.zipWithIndex.map { + case (s, i) => s.io.l1Hint + } + // should only Hint for DCache + val (sourceIsDcache, dcacheSourceIdStart) = node.in.head._2.client.clients + .filter(_.supports.probe) + .map(c => { + (c.sourceId.contains(l1HintArb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) + }).head + + l1HintArb.io.in <> VecInit(slices_l1Hint) + io.l2_hint.valid := l1HintArb.io.out.fire && sourceIsDcache + io.l2_hint.bits.sourceId := l1HintArb.io.out.bits.sourceId - dcacheSourceIdStart + io.l2_hint.bits.isKeyword := l1HintArb.io.out.bits.isKeyword + // continuous hints can only be sent every two cycle, since GrantData takes two cycles + l1HintArb.io.out.ready := !RegNext(io.l2_hint.valid, false.B) + + hintChosen := l1HintArb.io.chosen // ! THIS IS NOT ONE-HOT ! 
+ hintFire := io.l2_hint.valid } - val (client_sourceId_match_oh, client_sourceId_start) = node.in.head._2.client.clients - .map(c => { - (c.sourceId.contains(l1Hint_arb.io.out.bits.sourceId).asInstanceOf[Bool], c.sourceId.start.U) - }) - .unzip - l1Hint_arb.io.in <> VecInit(slices_l1Hint) - io.l2_hint.valid := l1Hint_arb.io.out.fire - io.l2_hint.bits.sourceId := l1Hint_arb.io.out.bits.sourceId - Mux1H(client_sourceId_match_oh, client_sourceId_start) - io.l2_hint.bits.isKeyword := l1Hint_arb.io.out.bits.isKeyword - // always ready for grant hint - l1Hint_arb.io.out.ready := true.B - - hint_chosen := l1Hint_arb.io.chosen - hint_fire := io.l2_hint.valid + // ==================== TopDown ==================== val topDown = topDownOpt.map(_ => Module(new TopDownMonitor()(p.alterPartial { case EdgeInKey => node.in.head._2 case EdgeOutKey => node.out.head._2 @@ -276,12 +310,35 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { case None => io.debugTopDown.l2MissMatch.foreach(_ := false.B) } - XSPerfAccumulate(cacheParams, "hint_fire", io.l2_hint.valid) - val grant_fire = slices.map{ slice => { - val (_, _, grant_fire_last, _) = node.in.head._2.count(slice.io.in.d) - slice.io.in.d.fire && grant_fire_last && slice.io.in.d.bits.opcode === GrantData - }} - XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_fire))) + // ==================== XSPerf Counters ==================== + val grant_data_fire = slices.map { slice => { + val (first, _, _, _) = node.in.head._2.count(slice.io.in.d) + slice.io.in.d.fire && first && slice.io.in.d.bits.opcode === GrantData + } + } + XSPerfAccumulate(cacheParams, "grant_data_fire", PopCount(VecInit(grant_data_fire))) + + val hint_source = io.l2_hint.bits.sourceId + + val grant_data_source = ParallelPriorityMux(slices.map { + s => (s.io.in.d.fire, s.io.in.d.bits.source) + }) + + val hintPipe2 = Module(new Pipeline(UInt(32.W), 2)) + hintPipe2.io.in.valid := io.l2_hint.valid + hintPipe2.io.in.bits := hint_source + hintPipe2.io.out.ready := true.B + + val hintPipe1 = Module(new Pipeline(UInt(32.W), 1)) + hintPipe1.io.in.valid := io.l2_hint.valid + hintPipe1.io.in.bits := hint_source + hintPipe1.io.out.ready := true.B + + val accurateHint = grant_data_fire.orR && hintPipe2.io.out.valid && hintPipe2.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "accurate3Hints", accurateHint) + + val okHint = grant_data_fire.orR && hintPipe1.io.out.valid && hintPipe1.io.out.bits === grant_data_source + XSPerfAccumulate(cacheParams, "ok2Hints", okHint) } lazy val module = new CoupledL2Imp(this) diff --git a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala index ab4e4fda8..10dd6fef2 100644 --- a/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala +++ b/src/main/scala/coupledL2/utils/L2PerfCounterUtils.scala @@ -190,6 +190,39 @@ object XSPerfRolling { rollingTable.log(rollingPt, triggerDB, "", clock, reset) } } + + // event interval based mode + def apply( + params: L2Param, + perfName: String, + perfCntX: UInt, + perfCntY: UInt, + granularity: Int, + eventTrigger: UInt, + clock: Clock, + reset: Reset + ): Unit = { + if (params.enablePerf && !params.FPGAPlatform) { + val tableName = perfName + "_rolling_0" + val rollingTable = ChiselDB.createTable(tableName, new RollingEntry(), basicDB = true) + + val xAxisCnt = RegInit(0.U(64.W)) + val yAxisCnt = RegInit(0.U(64.W)) + val eventCnt = RegInit(0.U(64.W)) + xAxisCnt := xAxisCnt + perfCntX + yAxisCnt := yAxisCnt + 
perfCntY + eventCnt := eventCnt + eventTrigger + + val triggerDB = eventCnt >= granularity.U + when(triggerDB) { + eventCnt := eventTrigger + xAxisCnt := perfCntX + yAxisCnt := perfCntY + } + val rollingPt = new RollingEntry().apply(xAxisCnt, yAxisCnt) + rollingTable.log(rollingPt, triggerDB, "", clock, reset) + } + } } object TransactionLatencyCounter { diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala index d5182f02a..71b0ff182 100644 --- a/src/test/scala/TestTop.scala +++ b/src/test/scala/TestTop.scala @@ -399,6 +399,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { case l2 => { l2.module.io.debugTopDown := DontCare l2.module.io.hartId := DontCare + l2.module.io.l2_tlb_req <> DontCare } } diff --git a/src/test/scala/chi/TestTop.scala b/src/test/scala/chi/TestTop.scala index 4ce9a9ac1..6d47abd34 100644 --- a/src/test/scala/chi/TestTop.scala +++ b/src/test/scala/chi/TestTop.scala @@ -151,6 +151,7 @@ class TestTop_CHIL2(numCores: Int = 1, numULAgents: Int = 0, banks: Int = 1)(imp l2.module.io.hartId := i.U l2.module.io.nodeID := i.U(NODEID_WIDTH.W) l2.module.io.debugTopDown := DontCare + l2.module.io.l2_tlb_req <> DontCare } } diff --git a/utility b/utility index 1b7acf099..92b7cfbba 160000 --- a/utility +++ b/utility @@ -1 +1 @@ -Subproject commit 1b7acf0998ddf175527aa0609788c3fea1262b1f +Subproject commit 92b7cfbbaacfda6c9ff691e12c48421d9f6d0f99 From 5dc3553214bf0b3b40358835a763d25293fe6894 Mon Sep 17 00:00:00 2001 From: zhanglinjuan Date: Tue, 7 May 2024 21:46:25 +0800 Subject: [PATCH 9/9] workflows: add CI for TL2TLCoupledL2 unit test --- .github/workflows/main.yml | 22 ++++++++++++++----- src/main/scala/coupledL2/tl2tl/MSHR.scala | 5 +++++ src/main/scala/coupledL2/tl2tl/MainPipe.scala | 3 +++ src/main/scala/coupledL2/tl2tl/SinkB.scala | 3 +++ .../coupledL2/tl2tl/TL2TLCoupledL2.scala | 1 + src/test/scala/TestTop.scala | 11 ++++++++-- 6 files changed, 37 insertions(+), 8 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 43cfa2a37..d61d8069f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,9 +6,9 @@ name: CI on: # Triggers the workflow on push or pull request events but only for the main branch push: - branches: [ chi-coupledl2, chi-coupledl2-ci-test ] + branches: [ master, chi-coupledl2 ] pull_request: - branches: [ chi-coupledl2, chi-coupledl2-ci-test ] + branches: [ master, chi-coupledl2 ] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: @@ -45,9 +45,19 @@ jobs: # - name: Check scalafmt # run: make checkformat - # - name: Compile - # run: make compile - - - name: Compile QuadCore + - name: Compile + run: make compile + + - name: Unit test for TileLink version + run: | + git clone https://github.com/OpenXiangShan/tl-test -b coupledL2-huancun + make test-top-l2l3l2 + cd ./tl-test + mkdir build && cd build + cmake .. 
-DDUT_DIR=../../build -DCHISELDB=1 + make + ./tlc_test -s $RANDOM + + - name: Compile CHI QuadCore run: | make test-top-chi-quadcore-2ul diff --git a/src/main/scala/coupledL2/tl2tl/MSHR.scala b/src/main/scala/coupledL2/tl2tl/MSHR.scala index d503ba62f..b704c4f5d 100644 --- a/src/main/scala/coupledL2/tl2tl/MSHR.scala +++ b/src/main/scala/coupledL2/tl2tl/MSHR.scala @@ -161,6 +161,7 @@ class MSHR(implicit p: Parameters) extends L2Module { } val mp_release, mp_probeack, mp_grant = Wire(new TaskBundle) val mp_release_task = { + mp_release := 0.U.asTypeOf(new TaskBundle) mp_release.channel := req.channel mp_release.tag := dirResult.tag mp_release.set := req.set @@ -209,6 +210,7 @@ class MSHR(implicit p: Parameters) extends L2Module { } val mp_probeack_task = { + mp_probeack := 0.U.asTypeOf(new TaskBundle) mp_probeack.channel := req.channel mp_probeack.tag := req.tag mp_probeack.set := req.set @@ -278,6 +280,7 @@ class MSHR(implicit p: Parameters) extends L2Module { mergeA := false.B } val mp_grant_task = { + mp_grant := 0.U.asTypeOf(new TaskBundle) mp_grant.channel := req.channel mp_grant.tag := req.tag mp_grant.set := req.set @@ -553,6 +556,8 @@ class MSHR(implicit p: Parameters) extends L2Module { io.msInfo.bits.w_releaseack := state.w_releaseack io.msInfo.bits.w_replResp := state.w_replResp io.msInfo.bits.w_rprobeacklast := state.w_rprobeacklast + io.msInfo.bits.replaceData := mp_release.opcode === ReleaseData + io.msInfo.bits.metaState := meta.state assert(!(c_resp.valid && !io.status.bits.w_c_resp)) assert(!(d_resp.valid && !io.status.bits.w_d_resp)) diff --git a/src/main/scala/coupledL2/tl2tl/MainPipe.scala b/src/main/scala/coupledL2/tl2tl/MainPipe.scala index 610cc8c0a..5e6fb9fc4 100644 --- a/src/main/scala/coupledL2/tl2tl/MainPipe.scala +++ b/src/main/scala/coupledL2/tl2tl/MainPipe.scala @@ -227,6 +227,9 @@ class MainPipe(implicit p: Parameters) extends L2Module { ms_task.mergeA := req_s3.mergeA ms_task.aMergeTask := req_s3.aMergeTask ms_task.txChannel := 0.U + ms_task.snpHitRelease := false.B + ms_task.snpHitReleaseWithData := false.B + ms_task.snpHitReleaseIdx := 0.U /* ======== Resps to SinkA/B/C Reqs ======== */ val sink_resp_s3 = WireInit(0.U.asTypeOf(Valid(new TaskBundle))) // resp for sinkA/B/C request that does not need to alloc mshr diff --git a/src/main/scala/coupledL2/tl2tl/SinkB.scala b/src/main/scala/coupledL2/tl2tl/SinkB.scala index 65e133a46..5c23d7d92 100644 --- a/src/main/scala/coupledL2/tl2tl/SinkB.scala +++ b/src/main/scala/coupledL2/tl2tl/SinkB.scala @@ -68,6 +68,9 @@ class SinkB(implicit p: Parameters) extends L2Module { task.replTask := false.B task.mergeA := false.B task.aMergeTask := 0.U.asTypeOf(new MergeTaskBundle) + task.snpHitRelease := false.B + task.snpHitReleaseWithData := false.B + task.snpHitReleaseIdx := 0.U task } val task = fromTLBtoTaskBundle(io.b.bits) diff --git a/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala index f3eb6b281..a00638699 100644 --- a/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala +++ b/src/main/scala/coupledL2/tl2tl/TL2TLCoupledL2.scala @@ -141,6 +141,7 @@ class TL2TLCoupledL2(implicit p: Parameters) extends CoupledL2Base { val prefetchTrains = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchTrain()(pftParams))))) val prefetchResps = prefetchOpt.map(_ => Wire(Vec(banks, DecoupledIO(new PrefetchResp()(pftParams))))) val prefetchReqsReady = WireInit(VecInit(Seq.fill(banks)(false.B))) + io.l2_tlb_req <> DontCare // TODO: l2_tlb_req should be Option 
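The mp_release/mp_probeack/mp_grant hunks above apply a defensive Chisel idiom: zero-initialize the whole TaskBundle first, then override only the fields this path cares about, so fields added later (snpHitRelease and friends) can never be left unconnected. A minimal sketch of the initialize-then-override pattern, with TaskLike as a hypothetical stand-in for the real TaskBundle:

import chisel3._

class TaskLike extends Bundle {
  val tag    = UInt(16.W)
  val set    = UInt(8.W)
  val opcode = UInt(3.W)
  val snpHitRelease = Bool() // a later addition; defaults safely to zero
}

class TaskBuilder extends Module {
  val io = IO(new Bundle {
    val tag = Input(UInt(16.W))
    val out = Output(new TaskLike)
  })
  val task = Wire(new TaskLike)
  task := 0.U.asTypeOf(new TaskLike) // every field gets a defined default first
  task.tag    := io.tag              // then override what this path cares about
  task.opcode := 6.U                 // opcode value is illustrative only
  io.out := task
}

Without the blanket default, adding snpHitRelease to the bundle would make every such Wire fail FIRRTL's not-fully-initialized check (or, worse, silently carry garbage in tools that do not check), which is exactly what this patch fixes.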
prefetchOpt.foreach { _ => fastArb(prefetchTrains.get, prefetcher.get.io.train, Some("prefetch_train")) diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala index 71b0ff182..9900f2bc6 100644 --- a/src/test/scala/TestTop.scala +++ b/src/test/scala/TestTop.scala @@ -48,7 +48,9 @@ class TestTop_L2()(implicit p: Parameters) extends LazyModule { val l1d_nodes = (0 until 1) map( i => createClientNode(s"l1d$i", 32)) val master_nodes = l1d_nodes - val l2 = LazyModule(new TL2TLCoupledL2()) + val l2 = LazyModule(new TL2TLCoupledL2()(new Config((_, _, _) => { + case BankBitsKey => 0 + }))) val xbar = TLXbar() val ram = LazyModule(new TLRAM(AddressSet(0, 0xffffL), beatBytes = 32)) @@ -139,6 +141,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule { rrTagBits = 6 )) ) + case BankBitsKey => 0 }))) val l3 = LazyModule(new HuanCun()(new Config((_, _, _) => { @@ -255,7 +258,9 @@ class TestTop_L2_Standalone()(implicit p: Parameters) extends LazyModule { val l1d_nodes = (0 until 1) map( i => createClientNode(s"l1d$i", 32)) val master_nodes = l1d_nodes - val l2 = LazyModule(new TL2TLCoupledL2()) + val l2 = LazyModule(new TL2TLCoupledL2()(new Config((_, _, _) => { + case BankBitsKey => 0 + }))) val xbar = TLXbar() val l3 = createManagerNode("Fake_L3", 16) @@ -338,6 +343,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { echoField = Seq(DirtyField()), hartIds = Seq{i} ) + case BankBitsKey => 0 })))) val l2_nodes = coupledL2.map(_.node) @@ -475,6 +481,7 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { rrTagBits = 6 )) ) + case BankBitsKey => 0 }))) l1xbar := TLBuffer() := l1i
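The TestTop hunks above all pin BankBitsKey => 0 because these standalone test tops instantiate a single bank. A minimal sketch of that configuration pattern, assuming the CDE config library; the BankBitsKey below is a local stand-in for the Field defined in coupledL2:

import org.chipsalliance.cde.config.{Config, Field, Parameters}

case object BankBitsKey extends Field[Int] // stand-in for coupledL2's key

object SingleBankParams {
  // the TestTop style: a Config whose only job is to pin the key
  val config = new Config((_, _, _) => {
    case BankBitsKey => 0 // one bank => zero bank-index bits in the address
  })

  // equivalent, when an existing Parameters instance is being extended
  def apply(base: Parameters): Parameters =
    base.alterPartial { case BankBitsKey => 0 }
}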