From 6d1d1fa763b48df3f5b4e2ec213dc41f722eb251 Mon Sep 17 00:00:00 2001 From: Mike Walters Date: Sat, 20 Apr 2024 15:51:01 +0100 Subject: [PATCH 1/2] hyperram_diagnostic: use platform-specified clock domain generator This allows for use on other platforms, rather than enforcing the LunaECP5DomainGenerator --- applets/hyperram_diagnostic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/applets/hyperram_diagnostic.py b/applets/hyperram_diagnostic.py index 488bfa69..712ebe6a 100755 --- a/applets/hyperram_diagnostic.py +++ b/applets/hyperram_diagnostic.py @@ -18,7 +18,6 @@ from luna import top_level_cli from apollo_fpga import ApolloDebugger from luna.gateware.interface.jtag import JTAGRegisterInterface -from luna.gateware.architecture.car import LunaECP5DomainGenerator from luna.gateware.interface.psram import HyperRAMPHY, HyperRAMInterface REGISTER_RAM_REGISTER_SPACE = 1 @@ -38,7 +37,7 @@ def elaborate(self, platform): m = Module() # Generate our clock domains. - clocking = LunaECP5DomainGenerator() + clocking = platform.clock_domain_generator() m.submodules.clocking = clocking # Create a set of registers... 
From af5acbde34bf0267e5be6bde187da1d02a1b8e8b Mon Sep 17 00:00:00 2001 From: Mike Walters Date: Sat, 20 Apr 2024 15:38:23 +0100 Subject: [PATCH 2/2] gateware.interface.psram: add HyperRAM implementation using ECP5 DQS logic --- applets/hyperram_diagnostic.py | 49 ++- luna/gateware/interface/psram.py | 520 ++++++++++++++++++++++++++++++- 2 files changed, 549 insertions(+), 20 deletions(-) diff --git a/applets/hyperram_diagnostic.py b/applets/hyperram_diagnostic.py index 712ebe6a..25d4c18a 100755 --- a/applets/hyperram_diagnostic.py +++ b/applets/hyperram_diagnostic.py @@ -18,7 +18,7 @@ from luna import top_level_cli from apollo_fpga import ApolloDebugger from luna.gateware.interface.jtag import JTAGRegisterInterface -from luna.gateware.interface.psram import HyperRAMPHY, HyperRAMInterface +from luna.gateware.interface.psram import HyperRAMPHY, HyperRAMInterface, HyperRAMDQSInterface, HyperRAMDQSPHY REGISTER_RAM_REGISTER_SPACE = 1 REGISTER_RAM_ADDR = 2 @@ -26,6 +26,9 @@ REGISTER_RAM_FIFO = 4 REGISTER_RAM_START = 5 +DQS = False +REG_WIDTH = 32 if DQS else 16 +REG_SHIFT = 16 if DQS else 0 class HyperRAMDiagnostic(Elaboratable): """ @@ -36,27 +39,40 @@ class HyperRAMDiagnostic(Elaboratable): def elaborate(self, platform): m = Module() + clock_frequencies = platform.DEFAULT_CLOCK_FREQUENCIES_MHZ + + # + # HyperRAM test connections. + # + if DQS: + clock_frequencies = { + "fast": 120, + "sync": 60, + "usb": 60, + } + ram_bus = platform.request('ram', dir={'rwds':'-', 'dq':'-', 'cs':'-'}) + psram_phy = HyperRAMDQSPHY(bus=ram_bus) + psram = HyperRAMDQSInterface(phy=psram_phy.phy) + else: + ram_bus = platform.request('ram') + psram_phy = HyperRAMPHY(bus=ram_bus) + psram = HyperRAMInterface(phy=psram_phy.phy) + + m.submodules += [psram_phy, psram] + # Generate our clock domains. - clocking = platform.clock_domain_generator() + clocking = platform.clock_domain_generator(clock_frequencies=clock_frequencies) m.submodules.clocking = clocking # Create a set of registers... 
registers = JTAGRegisterInterface(address_size=7, default_read_value=0xDEADBEEF) m.submodules.registers = registers - # - # HyperRAM test connections. - # - ram_bus = platform.request('ram') - psram_phy = HyperRAMPHY(bus=ram_bus) - psram = HyperRAMInterface(phy=psram_phy.phy) - m.submodules += [psram_phy, psram] - psram_address = registers.add_register(REGISTER_RAM_ADDR) read_length = registers.add_register(REGISTER_RAM_READ_LENGTH, reset=1) - m.submodules.read_fifo = read_fifo = SyncFIFO(width=16, depth=32) - m.submodules.write_fifo = write_fifo = SyncFIFO(width=16, depth=32) + m.submodules.read_fifo = read_fifo = SyncFIFO(width=REG_WIDTH, depth=32) + m.submodules.write_fifo = write_fifo = SyncFIFO(width=REG_WIDTH, depth=32) registers.add_sfr(REGISTER_RAM_FIFO, read=read_fifo.r_data, read_strobe=read_fifo.r_en, @@ -122,7 +138,10 @@ def elaborate(self, platform): dut = ApolloDebugger() logging.info(f"Connected to onboard dut; hardware revision r{dut.major}.{dut.minor} (s/n: {dut.serial_number}).") - logging.info("Running basic HyperRAM diagnostics.") + if DQS: + logging.info("Running basic HyperRAM diagnostics, using DQS implementation.") + else: + logging.info("Running basic HyperRAM diagnostics.") iterations = 1 @@ -135,7 +154,7 @@ def read_hyperram_register(addr): dut.registers.register_write(REGISTER_RAM_ADDR, addr) dut.registers.register_read(REGISTER_RAM_START) time.sleep(0.1) - return dut.registers.register_read(REGISTER_RAM_FIFO) + return dut.registers.register_read(REGISTER_RAM_FIFO) >> REG_SHIFT def test_id_read(): return read_hyperram_register(0x0) in (0x0c81, 0x0c86) @@ -146,7 +165,7 @@ def test_config_read(): def test_mem_readback(): dut.registers.register_write(REGISTER_RAM_REGISTER_SPACE, 0) - data = [random.randint(0, int(2**16)) for _ in range(10)] + data = [random.randint(0, int(2**REG_WIDTH)) for _ in range(10)] # Fill write FIFO. 
for d in data: diff --git a/luna/gateware/interface/psram.py b/luna/gateware/interface/psram.py index af31f80a..1b7e4e7c 100644 --- a/luna/gateware/interface/psram.py +++ b/luna/gateware/interface/psram.py @@ -37,7 +37,6 @@ class HyperRAMInterface(Elaboratable): I/O port: B: phy -- The primary physical connection to the DRAM chip. - I: reset -- An active-high signal used to provide a prolonged reset upon configuration. I: address[32] -- The address to be targeted by the given operation. I: register_space -- When set to 1, read and write requests target registers instead of normal RAM. @@ -68,7 +67,6 @@ def __init__(self, *, phy): # I/O port. # self.phy = phy - self.reset = Signal() # Control signals. self.address = Signal(32) @@ -87,8 +85,6 @@ def __init__(self, *, phy): self.read_data = Signal(16) self.write_data = Signal(16) - self.clk = Signal() - def elaborate(self, platform): m = Module() @@ -307,7 +303,7 @@ def elaborate(self, platform): class HyperRAMPHY(Elaboratable): - """ Gateware interface to HyperRAM series self-refreshing DRAM chips. + """ Gateware PHY for HyperRAM series self-refreshing DRAM chips. I/O port: B: bus -- The primary physical connection to the DRAM chip. @@ -405,3 +401,517 @@ def elaborate(self, platform): ] return m + + +class HyperBusDQSPHY(Record): + """ Record representing a 32-bit HyperBus interface on a DQS group for use with a 4:1 PHY module. """ + + def __init__(self): + super().__init__([ + ('clk_en', 2, DIR_FANOUT), + ('dq', [ + ('i', 32, DIR_FANIN), + ('o', 32, DIR_FANOUT), + ('e', 1, DIR_FANOUT), + ]), + ('rwds', [ + ('i', 4, DIR_FANIN), + ('o', 4, DIR_FANOUT), + ('e', 1, DIR_FANOUT), + ]), + ('cs', 1, DIR_FANOUT), + ('reset', 1, DIR_FANOUT), + ('read', 2, DIR_FANIN), + ('datavalid', 1, DIR_FANOUT), + ('burstdet', 1, DIR_FANOUT) + ]) + + + +class HyperRAMDQSInterface(Elaboratable): + """ Gateware interface to HyperRAM series self-refreshing DRAM chips, using ECP5 DQS logic. 
+ + I/O port: + B: phy -- The primary physical connection to the DRAM chip. + + I: address[32] -- The address to be targeted by the given operation. + I: register_space -- When set to 1, read and write requests target registers instead of normal RAM. + I: perform_write -- When set to 1, a transfer request is viewed as a write, rather than a read. + I: single_page -- If set, data accesses will wrap around to the start of the current page when done. + I: start_transfer -- Strobe that goes high for 1-8 cycles to request a transfer. + [This added duration allows other clock domains to easily perform requests.] + I: final_word -- Flag that indicates the current word is the last word of the transaction. + + O: read_data[32] -- word that holds the 32 bits most recently read from the PSRAM + I: write_data[32] -- word that accepts the data to output during this transaction + + O: idle -- High whenever the transmitter is idle (and thus we can start a new piece of data.) + O: read_ready -- Strobe that indicates when new data is ready for reading + O: write_ready -- Strobe that indicates `write_data` has been latched and is ready for new data + """ + + LOW_LATENCY_CLOCKS = 3 + HIGH_LATENCY_CLOCKS = 5 + + def __init__(self, *, phy): + """ + Parameters: + phy -- The RAM record that should be connected to this RAM chip. + """ + + # + # I/O port. + # + self.phy = phy + + # Control signals. + self.address = Signal(32) + self.register_space = Signal() + self.perform_write = Signal() + self.single_page = Signal() + self.start_transfer = Signal() + self.final_word = Signal() + + # Status signals. + self.idle = Signal() + self.read_ready = Signal() + self.write_ready = Signal() + + # Data signals. + self.read_data = Signal(32) + self.write_data = Signal(32) + + + def elaborate(self, platform): + m = Module() + + # + # Latched control/addressing signals.
+ # + is_read = Signal() + is_register = Signal() + current_address = Signal(32) + is_multipage = Signal() + + # + # FSM datapath signals. + # + + # Tracks whether we need to add an extra latency period between our + # command and the data body. + extra_latency = Signal() + + # Tracks how many cycles of latency we have remaining between a command + # and the relevant data stages. + latency_clocks_remaining = Signal(range(0, self.HIGH_LATENCY_CLOCKS + 1)) + + # + # Core operation FSM. + # + + # Provide defaults for our control/status signals. + m.d.sync += [ + self.phy.clk_en .eq(0b11), + self.phy.cs .eq(1), + self.phy.rwds.e .eq(0), + self.phy.dq.e .eq(0), + self.phy.read .eq(0), + ] + m.d.comb += self.write_ready.eq(0), + + # Commands, in order of bytes sent: + # - WRBAAAAA + # W => selects read or write; 1 = read, 0 = write + # R => selects register or memory; 1 = register, 0 = memory + # B => selects burst behavior; 0 = wrapped, 1 = linear + # AAAAA => address bits [27:32] + # + # - AAAAAAAA => address bits [19:27] + # - AAAAAAAA => address bits [11:19] + # - AAAAAAAA => address bits [ 3:11] + # - 00000000 => [reserved] + # - 00000AAA => address bits [ 0: 3] + ca = Signal(48) + m.d.comb += ca.eq(Cat( + current_address[0:3], + Const(0, 13), + current_address[3:32], + is_multipage, + is_register, + is_read + )) + + with m.FSM() as fsm: + + # IDLE state: waits for a transaction request + with m.State('IDLE'): + m.d.comb += self.idle .eq(1) + m.d.sync += self.phy.clk_en .eq(0) + + # Once we have a transaction request, latch in our control + # signals, and assert our chip-select.
+ with m.If(self.start_transfer): + m.next = 'LATCH_RWDS' + + m.d.sync += [ + is_read .eq(~self.perform_write), + is_register .eq(self.register_space), + is_multipage .eq(~self.single_page), + current_address .eq(self.address), + self.phy.dq.o .eq(0), + ] + + with m.Else(): + m.d.sync += self.phy.cs.eq(0) + + + # LATCH_RWDS -- latch in the value of the RWDS signal, + # which determines our read/write latency. + with m.State("LATCH_RWDS"): + m.d.sync += extra_latency.eq(self.phy.rwds.i), + m.d.sync += self.phy.clk_en.eq(0b11) + m.next="SHIFT_COMMAND0" + + + # SHIFT_COMMANDx -- shift each of our command words out + with m.State('SHIFT_COMMAND0'): + # Output the first 32 bits of our command. + m.d.sync += [ + self.phy.dq.o.eq(Cat(ca[16:48])), + self.phy.dq.e.eq(1), + ] + m.next = 'SHIFT_COMMAND1' + + with m.State('SHIFT_COMMAND1'): + # Output the remaining 16 bits of our command. + m.d.sync += [ + self.phy.dq.o.eq(Cat(Const(0, 16), ca[0:16])), + self.phy.dq.e.eq(1), + ] + + # If we have a register write, we don't need to handle + # any latency. Move directly to our WRITE_DATA state. + with m.If(is_register & ~is_read): + m.next = 'WRITE_DATA' + + # Otherwise, react with either a short period of latency + # or a longer one, depending on what the RAM requested via + # RWDS. + with m.Else(): + m.next = "HANDLE_LATENCY" + + # FIXME: our HyperRAM part has a fixed latency, but we could need to detect + # different variants from the configuration register in the future. + with m.If(extra_latency | 1): + m.d.sync += latency_clocks_remaining.eq(self.HIGH_LATENCY_CLOCKS) + with m.Else(): + m.d.sync += latency_clocks_remaining.eq(self.LOW_LATENCY_CLOCKS) + + + # HANDLE_LATENCY -- applies clock cycles until our latency period is over.
+ with m.State('HANDLE_LATENCY'): + m.d.sync += latency_clocks_remaining.eq(latency_clocks_remaining - 1) + + with m.If(latency_clocks_remaining == 0): + with m.If(is_read): + m.next = 'READ_DATA' + with m.Else(): + m.next = 'WRITE_DATA' + + + # READ_DATA -- reads words from the PSRAM + with m.State('READ_DATA'): + m.d.sync += self.phy.read.eq(0b11) + + datavalid_delay = Signal() + m.d.sync += datavalid_delay.eq(self.phy.datavalid) + + with m.If(self.phy.datavalid): + m.d.comb += [ + self.read_data .eq(self.phy.dq.i), + self.read_ready .eq(1), + ] + + # If our controller is done with the transaction, end it. + with m.If(self.final_word): + m.d.sync += self.phy.clk_en.eq(0), + m.next = 'RECOVERY' + + # WRITE_DATA -- write a word to the PSRAM + with m.State("WRITE_DATA"): + m.d.sync += [ + self.phy.dq.o .eq(self.write_data), + self.phy.dq.e .eq(1), + self.phy.rwds.e .eq(~is_register), + self.phy.rwds.o .eq(0), + ] + m.d.comb += self.write_ready.eq(1), + + # If we just finished a register write, we're done -- there's no need for recovery. + with m.If(is_register): + m.next = 'IDLE' + + with m.Elif(self.final_word): + m.next = 'RECOVERY' + + + # RECOVERY state: wait for the required period of time before a new transaction + with m.State('RECOVERY'): + m.d.sync += self.phy.clk_en .eq(0) + + # TODO: implement recovery + m.next = 'IDLE' + + + + return m + + +class HyperRAMDQSPHY(Elaboratable): + """ Gateware PHY for HyperRAM series self-refreshing DRAM chips, using ECP5 DQS logic. + + I/O port: + B: bus -- The primary physical connection to the DRAM chip. 
+ """ + + def __init__(self, *, bus, in_skew=None, out_skew=None, clock_skew=None): + self.bus = bus + self.phy = HyperBusDQSPHY() + + def elaborate(self, platform): + m = Module() + + # Handle initial DDRDLL lock & delay code update + pause = Signal() + freeze = Signal() + lock = Signal() + uddcntln = Signal() + counter = Signal(range(9)) + m.d.sync += counter.eq(counter + 1) + with m.FSM() as fsm: + with m.State('INIT'): + m.d.sync += [ + pause.eq(1), + freeze.eq(0), + uddcntln.eq(0), + ] + + with m.If(lock): + m.next = 'FREEZE' + m.d.sync += [ + freeze.eq(1), + counter.eq(0), + ] + + with m.State('FREEZE'): + with m.If(counter == 8): + m.next = 'UPDATE' + m.d.sync += [ + uddcntln.eq(1), + counter.eq(0), + ] + + with m.State('UPDATE'): + with m.If(counter == 8): + m.next = 'UPDATED' + m.d.sync += [ + uddcntln.eq(0), + counter.eq(0), + ] + + with m.State('UPDATED'): + with m.If(counter == 8): + m.next = 'UNPAUSE' + m.d.sync += [ + pause.eq(0), + counter.eq(0), + ] + + with m.State('UNPAUSE'): + pass + + + # DQS (RWDS) input + rwds_o = Signal() + rwds_oe_n = Signal() + rwds_in = Signal() + + dqsr90 = Signal() + dqsw = Signal() + dqsw270 = Signal() + ddrdel = Signal() + readptr = Signal(3) + writeptr = Signal(3) + m.submodules += [ + Instance("DDRDLLA", + i_CLK=ClockSignal("fast"), + i_RST=ResetSignal(), + i_FREEZE=freeze, + i_UDDCNTLN=uddcntln, + o_DDRDEL=ddrdel, + o_LOCK=lock, + ), + Instance("BB", + i_I=rwds_o, + i_T=rwds_oe_n, + o_O=rwds_in, + io_B=self.bus.rwds.io + ), + Instance("TSHX2DQSA", + i_RST=ResetSignal(), + i_ECLK=ClockSignal("fast"), + i_SCLK=ClockSignal(), + i_DQSW=dqsw, + i_T0=~self.phy.rwds.e, + i_T1=~self.phy.rwds.e, + o_Q=rwds_oe_n + ), + Instance("DQSBUFM", + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + + i_DQSI=rwds_in, + i_DDRDEL=ddrdel, + i_PAUSE=pause, + i_READ0=self.phy.read[0], + i_READ1=self.phy.read[1], + # TODO: may need to tune at runtime by trying different values & checking for BURSTDET high + 
i_READCLKSEL0=0, + i_READCLKSEL1=1, + i_READCLKSEL2=0, + + i_RDLOADN=0, + i_RDMOVE=0, + i_RDDIRECTION=1, + i_WRLOADN=0, + i_WRMOVE=0, + i_WRDIRECTION=1, + + o_DQSR90=dqsr90, + o_DQSW=dqsw, + o_DQSW270=dqsw270, + **{f"o_RDPNTR{i}": readptr[i] for i in range(len(readptr))}, + **{f"o_WRPNTR{i}": writeptr[i] for i in range(len(writeptr))}, + + o_DATAVALID=self.phy.datavalid, + o_BURSTDET=self.phy.burstdet, + ), + ] + + # Clock + clk_out = Signal() + clk_dqsw270 = Signal() + m.submodules += [ + Instance("DELAYG", + p_DEL_MODE="DQS_CMD_CLK", + i_A=clk_out, + o_Z=self.bus.clk, + ), + Instance("ODDRX2F", + i_D0=0, + i_D1=self.phy.clk_en[1], + i_D2=0, + i_D3=self.phy.clk_en[0], + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + o_Q=clk_out, + ), + ] + + # CS + cs_out = Signal() + m.submodules += [ + Instance("DELAYG", + p_DEL_MODE="DQS_CMD_CLK", + i_A=cs_out, + o_Z=self.bus.cs, + ), + Instance("ODDRX2F", + i_D0=~self.phy.cs, + i_D1=~self.phy.cs, + i_D2=~self.phy.cs, + i_D3=~self.phy.cs, + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + o_Q=cs_out, + ), + ] + + # RWDS out + m.submodules += [ + Instance("ODDRX2DQSB", + i_DQSW=dqsw, + i_D0=self.phy.rwds.o[3], + i_D1=self.phy.rwds.o[2], + i_D2=self.phy.rwds.o[1], + i_D3=self.phy.rwds.o[0], + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + o_Q=rwds_o, + ), + ] + + # DQ + for i in range(8): + dq_in = Signal(name=f"dq_in{i}") + dq_in_delayed = Signal(name=f"dq_in_delayed{i}") + dq_oe_n = Signal(name=f"dq_oe_n{i}") + dq_o = Signal(name=f"dq_o{i}") + # Out + m.submodules += [ + # Tristate + Instance("BB", + i_I=dq_o, + i_T=dq_oe_n, + o_O=dq_in, + io_B=self.bus.dq.io[i] + ), + Instance("TSHX2DQA", + i_T0=~self.phy.dq.e, + i_T1=~self.phy.dq.e, + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_DQSW270=dqsw270, + i_RST=ResetSignal(), + o_Q=dq_oe_n, + ), + + # Output + Instance("ODDRX2DQA", + i_DQSW270=dqsw270, + i_D0=self.phy.dq.o[i+24], + 
i_D1=self.phy.dq.o[i+16], + i_D2=self.phy.dq.o[i+8], + i_D3=self.phy.dq.o[i], + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + o_Q=dq_o, + ), + + # Input + Instance("DELAYG", + p_DEL_MODE="DQS_ALIGNED_X2", + i_A=dq_in, + o_Z=dq_in_delayed, + ), + Instance("IDDRX2DQA", + i_D=dq_in_delayed, + i_DQSR90=dqsr90, + i_SCLK=ClockSignal(), + i_ECLK=ClockSignal("fast"), + i_RST=ResetSignal(), + **{f"i_RDPNTR{i}": readptr[i] for i in range(len(readptr))}, + **{f"i_WRPNTR{i}": writeptr[i] for i in range(len(writeptr))}, + o_Q0=self.phy.dq.i[i+24], + o_Q1=self.phy.dq.i[i+16], + o_Q2=self.phy.dq.i[i+8], + o_Q3=self.phy.dq.i[i], + ), + ] + + return m