diff --git a/lectures/crash-course-day1.tex b/lectures/crash-course-day1.tex new file mode 100644 index 0000000..88acf74 --- /dev/null +++ b/lectures/crash-course-day1.tex @@ -0,0 +1,694 @@ +\input{../common/config} + +\title{Simulation Role in Software and Hardware Development, Basic Concepts} + +\begin{document} + +\startslides + +\section{Complexity of Modern Computer Systems} + +\begin{frame}{Complexity of Modern Computer Systems} + +\centering +\includegraphics[width=0.7\textwidth]{ic-floor} + +\tiny{Source: P. Horowitz and W. Hill. 1989. The Art of Electronics. Cambridge +University Press, New York, NY, USA} + +\end{frame} + +\begin{frame}{Why Software Development Only On Real Hardware Is Not Beneficial?} + +\begin{itemize} +\item Amount of available samples is usually limited, +\item low-level debug is challenging, +\item long development cycle. +\end{itemize} + +$\Rightarrow$ development cost is increasing. + +\bigskip + +``\tiny{I've noticed a shift during the past couple of years towards an increasing +use of various types of simulation, including virtual platforms. Previously +software developers wanted real hardware, but now they have to start using +simulation because there's no chip available.'' +\textit{Tomas Evensen, Wind River CTO}} + +\end{frame} + +\begin{frame}{Solution --- Software Models of Real Hardware} +\centering +\inputpicture{idea} + +\end{frame} + +\section{Areas of Application} + +\begin{frame}{Areas of Application} +\begin{itemize} +\item New hardware development, +\item software and hardware co-development, +\item experimental architectures, +\item power and performance prediction, +\item compatibility with other architectures. +\end{itemize} + +\end{frame} + +\begin{frame}{New Hardware Development} + +\inputpicture{error-cost} + +\end{frame} + +\begin{frame}{Software and Hardware Development} +\begin{itemize} +\item Firmware, BIOS, UEFI. +\item Operating systems. +\item Device drivers. +\item Compilers. +\item Applications. 
+\end{itemize} + +\end{frame} + +\begin{frame}{``Shift Left'' --- Accelerated Product Development} + +\centering + +\includegraphics[width=1\textwidth]{shift-left} % TODO TikZ-elize this + +\begin{tiny} +Impact of Shift left on hardware/software development. +Source: Semiconductor Engineering +\end{tiny} + +\vfill + +\begin{itemize} +\item Allow early software development --- before silicon arrives. +\item Shorten time to market by overlapping hardware and software designs. +\item Decouple software and hardware development. +\item Validation of software, hardware, and their integration starts earlier. +\end{itemize} + +\end{frame} + +\begin{frame}{Experimental Architectures} + +\begin{itemize} +\item New Instruction Set Architectures (ISA). +\item New ISA extensions. +\item Multicore systems. +\item Vector systems. +\item Security and Cryptography. +\end{itemize} + +\end{frame} + +\begin{frame}{Power and Performance Prediction} +% TODO: explain a bit every item. +\begin{itemize} +\item Untimed, +\item Loosely Timed, +\item Approximately timed. +\end{itemize} + +\end{frame} + +\begin{frame}{Compatibility with Other Architectures} +\includegraphics[width=\textwidth]{compat} % TODO TikZ-elize this +\end{frame} + +\section{Terminology} + +\begin{frame}{Terminology} +\begin{itemize} +\item \textbf{Simulation} --- replication of a system's behavior that can be + observed through \textbf{\textit{external}} interaction with the system. +\item \textbf{Emulation} --- replication of a system's behavior considering how + the system \textbf{\textit{internally}} works through imitation of all + internal structures and processes. +\item \textbf{Virtualization} --- effective isolation of several systems from + each other with simultaneous and transparent access to resources of the + underlying system. 
+\end{itemize} + +\end{frame} + +\begin{frame}{Types of Simulators} +\begin{itemize} +\item Full-platform, +\item Application level, +\item Functional, +\item Cycle-accurate, +\item Software, +\item Hybrid. +\end{itemize} +\end{frame} + +\section{Capabilities} + +\begin{frame}{Some Simulation Capabilities} +\begin{itemize} +\item Non-intrusive inspection, +\item Repeatability, +\item Save/restore of simulated state, +\item Synchronized system stop, +\item Reverse execution. +\end{itemize} +\end{frame} + +\section{Simulated System} +\begin{frame}{Simulated System} +\centering +% TODO: add a link from APIC to addr-decoder. +% Not important for this presenation but the link exists. +\begin{tikzpicture}[>=latex, font=\small, node distance = 0.5cm] + +\begin{scope}[minimum height=0.8cm] + \node[draw, ] (cpu) {CPU1}; + \node[draw, below=of cpu] (mmu) {MMU1}; + + \node[draw, left=of cpu] (cpu2) {CPU2}; + \node[draw, below=of cpu2] (mmu2) {MMU2}; + \node[draw, right=2cm of cpu, ] (pic) {APIC}; + + \coordinate[above=of pic] (op); + \coordinate (mp) at (barycentric cs:mmu=0.5,mmu2=0.5); + \node[draw, below=1cm of mp] (addr-decoder) {addr-decoder}; + + \node[draw, below=2cm of mmu, text width=4.5cm, align = center, ] (dram) {RAM}; + \node[draw, right=of pic, ] (pit) {PIT}; +\end{scope} + +\draw[<->] (cpu) -- (cpu |- mmu.north); +\draw[<->] (cpu2) -- (cpu2 |- mmu2.north); + +\draw[<->] (mmu) -- (mmu |- addr-decoder.north); +\draw[<->] (mmu2) -- (mmu2 |- addr-decoder.north); + +\draw[<->] (addr-decoder) -| (pit); +\draw[<->] (addr-decoder) -- (addr-decoder |- dram.north); + +\draw[->, ] (pic) -- (cpu); +\draw[->, ] (pit) -- (pic); +\draw[->, ] (pic) -- (op) -| (cpu2); +\end{tikzpicture} +\end{frame} + +\section{Timer} + +\begin{frame}{Example \No1: Timer} +\centering +\begin{tikzpicture}[>=latex] +\coordinate (center) at (0,0); +\node[draw, text width = 2cm, above = 0.5 cmof center] (reference) {\texttt{reference}}; +\node[draw, text width = 2cm, below = 0.5cm of center] 
(counter) {\texttt{counter}}; +\node[draw, text width = 0.4cm, right = 2cm of center, shape = isosceles triangle, inner sep=1dd] (comparator) {=?}; +\node[right = of comparator] (int) {\#INT}; +\node[above = of reference] (reset) {\#RESET}; +\node[below = of counter] (enable) {\#ENABLE}; +\node[left = 1.5cm of reference] (ref-input) {REF}; +\node[left = 0.25cm of counter.north west] (clk) {\small{CLK}}; + +% draw a quartz +\coordinate (quartz) at ([xshift = -2cm]counter.west); + \node[] at (quartz) {\small{F}}; +\draw (quartz) ++(-0.25,0.25) rectangle ++(0.5,-0.5); +\draw (quartz) ++(-0.25,0.35) -- ++ (0.5,0); +\draw (quartz) ++(-0.25,-0.35) -- ++ (0.5,0); +\draw (reset) -- (reference); +\draw (enable) -- (counter); + +% draw wires +\draw (reference.east) -| ([xshift = -0.2cm]comparator.160) -- (comparator.160); +\draw (counter.east) -| ([xshift = -0.2cm]comparator.200) -- (comparator.200); +\draw (ref-input) -- (reference) node[midway] {\tiny{/}} node[midway, above] {16}; +\draw (quartz) ++(0.25,0) -- (counter); +\draw[->] (comparator) -- (int); +\node[draw, dashed, fit = (reference) (counter) (comparator)] {}; +\end{tikzpicture} +\end{frame} + +\begin{frame}{Timing Diagram} +\centering +\begin{tikzpicture}[>=latex] +% set clock and #INT time lines +\draw[->] (0,0) -- (10,0) node[pos=0.95, below] {CLK}; +\draw[->] (0,1) -- (10,1) node[pos=0.0, above] {\#INT}; + +\foreach \x in { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19} { + \draw (\x/2,-0.15) -- (\x/2,0.15) {}; + \coordinate (tick\x) at (\x/2, 1); +}; + +\draw[fill=black] (tick3) circle (0.1cm); +\node[below = 0.15cm of tick3] (event-enable) {\tiny{ENABLE=1}}; + +\draw[fill=black] (tick10) circle (0.1cm); +\draw[fill=black] (tick17) circle (0.1cm); +\draw (tick3) -- ++(0, 1); +\draw (tick10) -- ++(0, 1); +\draw (tick17) -- ++(0, 1); +\draw[<->] ([yshift=0.8cm]tick3) -- ([yshift=0.8cm]tick10) node[midway, above] {reference}; +\draw[<->] ([yshift=0.8cm]tick10) -- ([yshift=0.8cm]tick17) 
node[midway, above] {reference}; + +% The actual #CLK plot +\draw[thick] (tick1) -- (tick10) -- ++(0, 0.5) -- ++(0.5, 0) -- + (tick11) -- (tick17) -- ++(0, 0.5) -- ++(0.5, 0) -- (tick18) -- (tick19); + +\end{tikzpicture} +\end{frame} + +\begin{frame}[fragile]{Simulation With a Fixed Step Size} +\begin{lstlisting} +on_clk() { + if (enable) counter +=1; + if (counter == reference) { + raise_int(); + counter = 0; + } else { + lower_int(); + } +} + +on_reset() { + reference = 0; + counter = 0; + enable = 0; +} +\end{lstlisting} +\end{frame} + +\begin{frame}{Typical Timer Characteristics} +\begin{itemize} + \item $\mathsf{F} \approx 10$ MHz, + \item $\mathsf{reference} > 10^3$, + \item \#RESET --- no more than one per $\approx 100$ seconds. +\end{itemize} +\vfill +$\Rightarrow$ externally visible effect (\#INT) occurs approximately once per +$10^3$ cycles. +\end{frame} + +\begin{frame}{Optimization} +No modeling for externally invisible actions. +\vfill +\centering +\begin{tikzpicture}[>=latex] +% set clock and #INT time lines +\draw[->] (0,1) -- (10,1) node[pos=0.0, above] {\#INT} node[pos=0.95, below] {t\textsubscript{sim}}; + +\foreach \x in { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19} { + \coordinate (tick\x) at (\x/2, 1); +}; + +\draw[fill=black] (tick3) circle (0.1cm); + +\draw[fill=black] (tick10) circle (0.1cm); +\draw[fill=black] (tick17) circle (0.1cm); +\draw (tick3) -- ++(0, 1); +\draw (tick10) -- ++(0, 1); +\draw (tick17) -- ++(0, 1); +\draw[<->] ([yshift=0.8cm]tick3) -- ([yshift=0.8cm]tick10) node[midway, above] {reference}; +\draw[<->] ([yshift=0.8cm]tick10) -- ([yshift=0.8cm]tick17) node[midway, above] {reference}; + +% The actual #CLK plot +\draw[thick] (tick1) -- (tick10) -- ++(0, 0.5) -- ++(0, -0.5) -- + (tick11) -- (tick17) -- ++(0, 0.5) -- ++(0, -0.5) -- (tick18) -- (tick19); +\end{tikzpicture} +\end{frame} + +\begin{frame}[fragile]{Discrete Event Simulation} +\begin{lstlisting} +typedef struct event { + time_t delta; + 
dev_t *device; + void (*function)(dev_t *device); +} event_t; + +event_t *event_queue; +time_t sim_time = 0; +for (event_t *e = next_event(&event_queue); + e != NULL; e = next_event(&event_queue)) { + e->function(e->device); + sim_time += e->delta; +} +\end{lstlisting} +\end{frame} + +\section{Delayed Response} + +\begin{frame}{Example \No2: Waiting for a Response} +\begin{center} +\begin{tikzpicture}[>=latex] +\node[draw, inner ysep=1cm] (dev1) {\texttt{dev1}}; +\node[draw, inner ysep=1cm, right= 3cm of dev1] (dev2) {\texttt{dev2}}; + +\draw[->] (dev1.55) -- (dev2.125) node[midway] {\tiny{/}} node[midway, above] {address}; + +\draw[->] (dev2) -- (dev1) node[midway] {\tiny{/}} node[midway, above] {data}; + +\draw[->] (dev2.240) -- (dev1.300) node[midway, above] {\#RDY}; +\end{tikzpicture} +\end{center} + +\begin{enumerate} +\item Request from \texttt{dev1}: \texttt{address}. +\item \texttt{dev2} calculates \texttt{data}. +\item \texttt{dev2} notifies \texttt{dev1} about data readiness + \textit{after some time} $\Delta T$ by \#RDY. +\item \texttt{dev1} works independently from \texttt{address} request to \#RDY + response. +\end{enumerate} +\end{frame} + +\begin{frame}{Implementation} +\texttt{ + dev1: + \begin{enumerate} + \item dev2.read(address); + \end{enumerate} + dev2: + \begin{enumerate} + \item data = get_data(address); + \item event_queue.post($\Delta T$, dev1, rdy()); + \end{enumerate} + dev1: + \begin{enumerate} + \item rdy() \{ read(data); \} + \end{enumerate} +} +\end{frame} + +\section{Theory} + +\begin{frame}{Event Queue} + \centering + \inputpicture{des} +\end{frame} + +\begin{frame}{Event Content and Results} +An event contains: +\begin{itemize} +\item time stamp ($\Delta T$ or absolute time), +\item a function to be called, +\item an object whose state is to be changed. +\end{itemize} + +Event handling results: +\begin{itemize} +\item changes to the state of the simulated system, +\item added or destroyed events. 
+\end{itemize} +\end{frame} + +\begin{frame}{Questions} +What should happen to the event queue when: +\begin{enumerate} + \item \texttt{reference} is written to?\pause + \item \#RESET happens?\pause + \item timer is disabled (ENABLE $\leftarrow$ 0)?\pause + \item \texttt{counter} is read? +\end{enumerate} +\end{frame} + +\begin{frame}[fragile]{Discrete Event Simulation Algorithm} +\begin{lstlisting} +typedef struct event event_t; + +struct event { + time_t delta; + dev_t *device; + void (*function)(dev_t *device, event_t **queue); +}; + +event_t *event_queue; +time_t sim_time = 0; +while (!empty(&event_queue)) { + sim_time += get_delta(&event_queue); + event_t *evt = pop_event(&event_queue); + evt->function(evt->device, &event_queue); +} +\end{lstlisting} +\end{frame} + +\begin{frame}{Event Properties} +\begin{itemize} +\item A new event cannot be created in the past. +\item Event handling can create new events. +\item Event handling can cancel future (not yet handled) events. +\item Several events may have the same time stamp. 
+\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Simics\reg~QSP example} +\emph{Demo: qsp-clear-linux.simics} +\footnotesize{\begin{verbatim} +simics> peq ++--------------+------------------------+----------------------------+ +| Cycle | Object | Description | ++--------------+------------------------+----------------------------+ +| 51367|board.mb.sb.hpet |tim_event | +| 1174759|board.mb.sb.uhci[0] |frame_update | +| 1174759|board.mb.sb.uhci[1] |frame_update | +| 1174759|board.mb.sb.uhci[2] |frame_update | +| 1174759|board.mb.sb.uhci[3] |frame_update | +| 1174759|board.mb.sb.uhci[4] |frame_update | +| 1174759|board.mb.sb.uhci[5] |frame_update | +| 1011284267|board.mb.cpu0.core[0][0]|performance counter overflow| +| 8470303804|board.mb.sb.lpc |pm1_ovf | +|37955235174759|board.mb.sb.rtc |rtc.rtc_timer | ++--------------+------------------------+----------------------------+ +\end{verbatim}} +\end{frame} + +\section{Co-simulation} + +\begin{frame}{Simulation Techniques We Know} +\begin{itemize} + \item Discrete event simulation: timer (non-executing device). + \item{<> models: memory (instant).} + \item Interpretation, binary translation, direct execution: + processors (executing devices), will review on the next lecture. 
+\end{itemize} +\end{frame} + +\begin{frame}{Simulation Using DES and Executing Models} +\centering +\begin{tikzpicture}[>=latex, font=\scriptsize] + \draw[->] (-0.5,0) -- (10.5,0) node[pos=1, above, align=center] (sim-time) {Simulated\\Time}; + + \begin{scope} + \clip (0,-2) rectangle (10, 2.5); + \foreach \x in { 1, 2, 3, 4, 5, 6, 7, 8, 9} { + \draw (\x,-0.15) -- (\x,0.15) node (tick\x) {}; + }; + + \node[shape=dart, draw, shape border rotate=270 ] at (1, 0.5) (event1) {}; + \node[shape=dart, draw, shape border rotate=270 ] at (5, 0.5) (event2) {}; + \node[shape=dart, draw, shape border rotate=270 ] at (9, 0.5) (event3) {}; + + \node[above of=event2] (deslabel) {Discrete Events}; + \draw[->] (deslabel) -- (event1); + \draw[->] (deslabel) -- (event2); + \draw[->] (deslabel) -- (event3); + + \draw (3,-0.5) ellipse[x radius = 2cm, y radius = 0.5cm] node {Processor Simulation}; + \draw (7,-0.5) ellipse[x radius = 2cm, y radius = 0.5cm] node {Processor Simulation}; + + \draw (-1,-0.5) ellipse[x radius = 2cm, y radius = 0.5cm] node {} ; + \draw (11,-0.5) ellipse[x radius = 2cm, y radius = 0.5cm] node {} ; + \end{scope} +\end{tikzpicture} +\end{frame} + +\begin{frame}{Co-Simulation} +\centering +\begin{tikzpicture}[>=latex] + \node[draw, circle, text width = 3cm, text badly centered] (dessim) {Discrete event simulator}; + \node[draw, circle, text width = 3cm, text badly centered, right = 2.5cm of dessim] (execsim) {Executing model simulator}; + + \draw (dessim.45) edge[bend left = 45, ->] (execsim.135); + \node[above=1cm of execsim.135] {\small Timer to the next event}; + \draw (execsim.225) edge[->, bend left = 45] (dessim.315); + \node[below=1cm of dessim.315] {\small Number of steps/cycles}; +\end{tikzpicture} +\end{frame} + +\section{Multi-Processor Simulation} + +\begin{frame}{Simulation of a Multi-Processor System} +\centering +\begin{tikzpicture}[>=latex] + \node[draw, circle] (core1) {Core 1}; + \node[draw, circle, right = of core1] (core2) {Core 2}; + 
\node[draw, circle, right = of core2] (core3) {Core 3}; + \node[right = of core3] (dots) {\dots}; + \node[draw, circle, right = of dots] (coren) {Core $N$}; + + \coordinate[below = 2.3cm of core1] (c3); + \coordinate[below = 1.5cm of coren] (c4); + + \node[draw, fit = (c3) (c4), inner ysep=1pt] (shmem) {Shared memory}; + + \draw[<->] (core1.south) -- (shmem); + \draw[<->] (core2.south) -- (shmem); + \draw[<->] (core3.south) -- (shmem); + \draw[<->] (coren.south) -- (shmem); +\end{tikzpicture} +\end{frame} + +\begin{frame}{Step-by-Step} +\begin{itemize} +\item How to maintain simultaneous instruction simulation for all guest + processors?\pause~\textit{Execute no more than one instruction at a time.} +\item It will be extremely slow! Maybe it is possible to simulate multiple guest + instructions without switching?\pause +\item How many?\pause~How much time inter processor communication takes in + hardware? +\end{itemize} +\end{frame} + +% TODO: Add a slide with simulation speed from time quantum dependency using +% qsp-clear-linux.simics + +\begin{frame}[fragile]{Temporal Decoupling --- Real Time} +\begin{center} +\begin{tikzpicture}[>=latex] + \draw[->] (0,0) -- (8,0) node[pos=1, below, align=center] (sim-time) {Real\\Time}; + + \foreach \x in { 1, 2, 3, 4, 5, 6, 7} { + \draw (\x,-0.15) -- (\x,0.15) node (tick\x) {}; + }; + \matrix[anchor=south west] at (-0.5,0.5){ + \node {CPU3}; & & & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!5] (arr3) {}; \\ + \node {CPU2}; & & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!10] (arr2) {}; & \\ + \node {CPU1}; & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!15] (arr1) {}; & & \\ + }; + + \draw[->] (arr1.east) -- (arr2.west); + \draw[->] (arr2.east) -- (arr3.west); +\end{tikzpicture} +\end{center} +\end{frame} + +\begin{frame}[fragile]{Temporal Decoupling --- Simulated Time} +\begin{center} +\begin{tikzpicture}[>=latex] + 
\draw[->] (0,0) -- (8,0) node[pos=1, below, align=center] (sim-time) {Simulated\\Time}; + + \foreach \x in { 1, 2, 3, 4, 5, 6, 7} { + \draw (\x,-0.15) -- (\x,0.15) node (tick\x) {}; + }; + \matrix[anchor=south west] at (-0.5,0.5){ + \node {CPU3}; & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!5] (arr3) {}; \\ + \node {CPU2}; & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!10] (arr2) {}; \\ + \node {CPU1}; & \node[shape=single arrow, draw, text width = 2cm, inner xsep = 0cm, fill=black!15] (arr1) {}; \\ + }; +\end{tikzpicture} +\end{center} +\end{frame} + +\begin{frame}{Quantum (Quota)} +Quantum (Quota) --- how many instructions a guest processor can run before +giving control back to the simulator. +\vfill +\begin{itemize} +\item A processor can run fewer instructions than allotted by the quota. +\item Too large a quota can cause unexpected behavior. +\item In a DES-based simulator pseudo-events can be used to cause a guest + processor switch. 
+\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Example \No3: \texttt{qsp-clear-linux.simics}} +Quantum on 8 core system: +\begin{lstlisting}[mathescape=true,keywordstyle=\ttfamily] +simics> cpu-switch-time +Current time quantum: 100.0 $\mu$s ++--------------+------------------------+ +|Cycles/quantum| Clock | ++--------------+------------------------+ +| 200000.00|board.mb.cpu0.core[0][0]| +| 200000.00|board.mb.cpu0.core[0][1]| +| 200000.00|board.mb.cpu0.core[1][0]| +| 200000.00|board.mb.cpu0.core[1][1]| +| 200000.00|board.mb.cpu0.core[2][0]| +| 200000.00|board.mb.cpu0.core[2][1]| +| 200000.00|board.mb.cpu0.core[3][0]| +| 200000.00|board.mb.cpu0.core[3][1]| ++--------------+------------------------+ +Default time quantum not set yet +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Example \No4: \texttt{qsp-clear-linux.simics}} +Simulated time on 8 core system: +\begin{verbatim} +running> ptime -all ++------------------------+----------+------------+--------+ +| Processor | Steps | Cycles |Time (s)| ++------------------------+----------+------------+--------+ +|board.mb.cpu0.core[0][0]|1376450747|107696800000| 53.848| +|board.mb.cpu0.core[0][1]| 746604719|107696600220| 53.848| +|board.mb.cpu0.core[1][0]| 746604647|107696600000| 53.848| +|board.mb.cpu0.core[1][1]| 746604686|107696600000| 53.848| +|board.mb.cpu0.core[2][0]| 746604725|107696600000| 53.848| +|board.mb.cpu0.core[2][1]| 746604764|107696600000| 53.848| +|board.mb.cpu0.core[3][0]| 746604803|107696600000| 53.848| +|board.mb.cpu0.core[3][1]| 746604842|107696600000| 53.848| ++------------------------+----------+------------+--------+ +\end{verbatim} +\end{frame} + +\section*{Conclusions} + +\begin{frame}{Conclusions} +\begin{itemize} +\item Software models are created before hardware availability. +\item Software models are used for software-hardware co-development. +\item Simulation provides unique debugging and development capabilities. +\item Simulation with a fixed step. 
+\item Event-driven simulation. +\item Discrete Event Simulation: + \begin{itemize} + \item Event creation. + \item Event handling. + \item Event destruction. + \end{itemize} +\item Co-simulation for executing and non-executing devices. +\item Multi-processor simulation: + \begin{itemize} + \item Temporal Decoupling. + \item Time quantum. + \end{itemize} +\end{itemize} +\end{frame} + +\begin{frame}[allowframebreaks]{Bibliography} +\begin{thebibliography}{99} + \bibitem{} [RUS] \textit{Речистов~Г.С, Юлюгин~Е.А и др.}, + Программное моделирование вычислительных систем. + \url{https://github.com/grigory-rechistov/simbook/blob/master/metoda/main-web.pdf} + \bibitem{} \textit{James Smith, Ravi Nair}, Virtual machines -- Versatile + Platforms for Systems and Processes. + \bibitem{} \textit{John Wiley \& Sons, Inc., ed. by J. Banks}. Handbook of + Simulation. Principles, Methodology, Advances, Applications, and Practice. + \bibitem{} \textit{J.~Engblom}. Temporal Decoupling - Are “Fast” and + “Correct” Mutually Exclusive? +\end{thebibliography} +\end{frame} + +\begin{frame}{On the Next Lecture:} +Simulation of architectural state: +\begin{itemize} +\item CPU instructions simulation +\end{itemize} +\end{frame} +\finalslide + +\end{document} diff --git a/lectures/crash-course-day2.tex b/lectures/crash-course-day2.tex new file mode 100644 index 0000000..3cbba46 --- /dev/null +++ b/lectures/crash-course-day2.tex @@ -0,0 +1,651 @@ +\input{../common/config} + +\usepackage{tikz} +\usetikzlibrary{shapes, calc, arrows, decorations.markings, decorations.pathreplacing, decorations.pathmorphing, decorations, patterns, chains, snakes, backgrounds, positioning, fit, shadows} +\title{Processor instruction simulation} + +\begin{document} + +\section{Start} +\startslides + +\begin{frame}{Simulated System} +\centering +\vfill +\inputpicture{cpu-mem} +\vfill +\end{frame} + +\section{Interpretation Pipeline} + +% TODO: This is not a pipeline! Add a proper pipeline picture. 
+% s/pipeline/execution stages/g + +\begin{frame}{Basic 5-Stage Pipeline} +\centering +\inputpicture{interpreter-cycle} +\end{frame} + +\begin{frame}[fragile]{Switched interpreter} +\begin{lstlisting} +while (run) { + raw_code = fetch(PC); + (opcode, operands) = decode(raw_code); + switch (opcode) { + + case opcode1: + func1(operands); PC++; break; + + case opcode2: + func2(operands); PC++; break; + + /*...*/ + } +} +\end{lstlisting} +\end{frame} + +\subsection{Fetch} + +\begin{frame}[fragile]{Fetch} +\texttt{data = mem[pc];}\pause +\vfill +Do not forget about address translation: +\begin{lstlisting} +paddr = v2p(pc); // pc is a virtual address +data = mem[paddr]; +\end{lstlisting} +\end{frame} + +% TODO: Add a slide about paging. People don't know about it at the moment. + +\begin{frame}{Fetch} +``Simple'' memory read? +\pause\bigskip +\begin{itemize} +\item Non-execute page. +\pause\bigskip +\item Unaligned accesses cause effects on some architectures. +\pause\bigskip +\item Cross-page accesses. \\ +The pages may have different access rights. +\end{itemize} +\end{frame} + +\subsection{Decode} + +\begin{frame}{Decode} +Decoding --- translation of instruction data from machine code to internal +(high-level) representation suitable for further analysis. +\end{frame} + +\begin{frame}{Example 1: RISC-V} +\centering +\includegraphics[width=.9\textwidth]{risc-v-formats} + +\tiny{Source: The RISC-V Instruction Set Manual, Volume I: Unprivileged ISA, + Document Version 20191213, page 16} +\end{frame} + +\begin{frame}[fragile]{Example 1: RISC-V decoder (1/3)} +\begin{lstlisting} +#define BIT_FIELD(v, e, s) \ + (((v) >> (s)) & ((1u << ((e) - (s) + 1)) - 1)) + +static inline int32_t +sign_extend(uint32_t v, int width) {/* ... 
*/}; + +typedef struct decode { + uint32_t opcode; + uint32_t rd; + uint32_t rs1; + uint32_t rs2; + int32_t imm; + uint32_t funct3; + uint32_t funct7; +} decode_t; +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Example 1: RISC-V decoder (2/3)} +\begin{lstlisting} +decode_t +decode(uint32_t raw) { + uint32_t op = BIT_FIELD(raw, 6, 0); + switch (type(op)) { + case I_type: + return decode_i_type(raw); + case R_type: + return decode_r_type(raw); + /*...*/ + } +} +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Example 1: RISC-V decoder (3/3)} +\begin{lstlisting} +decode_t +decode_i_type(uint32_t raw) { + uint32_t op = BIT_FIELD(raw, 6, 0); + uint32_t rd = BIT_FIELD(raw, 11, 7); + uint32_t funct3 = BIT_FIELD(raw, 14, 12); + uint32_t rs1 = BIT_FIELD(raw, 19, 15); + int32_t imm = sign_extend( + BIT_FIELD(raw, 31, 20), 12); + + return (decode_t){.opcode = op, .rd = rd, + .funct3 = funct3, .rs1 = rs1, + .imm = imm}; +} +\end{lstlisting} +\end{frame} + +\begin{frame}{Example 2: Intel\reg~IA-32} +\centering +\includegraphics[width=.9\textwidth]{ia32-evex} + +\tiny{J.C.S. Adrian et al. Systems, Apparatuses, and Methods for Blending Two + Source Operands into a Single Destination Using a Writemask. US Patent + Application Publication. \No~2012/0254588 A1} +\end{frame} + +\begin{frame}{What to Fetch From Machine Code?} +\begin{centering} +\inputpicture{instruction-anatomy} +\end{centering} +\vfill +Input: machine code. + +Output: +\begin{itemize} +\item Success, failure, not enough data. +\item In case of success: instruction length. +\item In case of success: information about operands. +\item In case of success: simulation routine. +\end{itemize} +\end{frame} + +\begin{frame}{Decode} +\begin{itemize} +\item Decoders are usually generated from ISA description. +\item In general: classical problem of parser/syntax analyser construction. +\item In practice: special tools and languages. +\item Example: Intel\reg~XED (x86 encoder-decoder). 
\url{https://github.com/intelxed/xed} +\end{itemize} +\end{frame} + +\begin{frame}{Decode: harsh reality} +\begin{itemize} +\item Variable instruction length. Intel\reg~IA-32: from 1 to 15 bytes. How many bytes to decode at once? +\item Decoding results depend on prefixes and execution mode. Example: 0x40--0x4f in Intel\reg~IA-32/Intel\reg~64/AMD64. +\end{itemize} +\end{frame} + +\begin{frame}{Disassemble} +\begin{itemize} +\item Disassemble --- translate from machine code into human readable + representation (mnemonic, assembly). +\item Encode (assemble) --- translate from mnemonic to machine code. +\end{itemize} +\end{frame} + +\subsection{Execute} + +\begin{frame}{Execute} +\begin{itemize} +\item Basic block --- simulation function for one instruction (a.k.a.~service routine). +\item Service routines are typically written in high-level programming + languages: portable solution. +\item Generators are often used. +\item Example: SimGen --- a single description is used to generate decoder, + disassembler and service routines. 
+\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Simulated state} +\begin{lstlisting} +typedef struct { + uint32_t pc; + + uint32_t regs[16]; + + bool z_flag; + bool n_flag; + bool o_flag; + bool c_flag; +} cpu_t; +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Example: ADD reg reg reg} +\begin{lstlisting} + +void add32_rrr(cpu_t *cpu, int src1, int src2, int dst) { + cpu->regs[dst] = cpu->regs[src1] + + cpu->regs[src2]; +\end{lstlisting} +\pause + +\begin{lstlisting} + cpu->z_flag = cpu->regs[dst] == 0; + cpu->n_flag = cpu->regs[dst] & (1u << 31); + cpu->o_flag = cpu->regs[dst] < + MAX(cpu->regs[src1], cpu->regs[src2]); + cpu->c_flag = calc_c_flag(cpu->regs[src1], + cpu->regs[src2]); +} +\end{lstlisting} +\end{frame} + +\begin{frame}{Intel\reg~IA-32 CALL} +\centering +\includegraphics[width=\textwidth]{ia32-call} + +\tiny{Source: Intel\reg~64 and IA-32 Architectures Software Developer’s Manual, + Order Number: 325462-073US, pages 716--732.} +\end{frame} + +\subsection{Memory} + +\begin{frame}[fragile]{Memory} +``Simple'' memory access: +\vfill +\begin{lstlisting} +write_mem(cpu, dst_addr, data, size); +data = read_mem(cpu, dst_addr, size); +\end{lstlisting} +\pause\vfill +\begin{itemize} +\item Attempt to change read-only memory, +\item Unaligned address, +\item Cross-page access. +\end{itemize} +\end{frame} + +\subsection{Exceptions} + +\begin{frame}{Accurate Pipeline} +\centering +\resizebox{9cm}{7cm}{\inputpicture{interpreter-cycle-exception}} +\end{frame} + +\begin{frame}{Classification} +\begin{itemize} +\item Exception --- synchronous, without repeating of current instruction. +\item Fault --- synchronous, with repeating of current instruction. +\item Trap --- synchronous, without repeating of current instruction, intentional. +\item Interrupt --- external, asynchronous. +\item Abort --- external, asynchronous, no return point. 
+\end{itemize} +\end{frame} + +\subsection{Write-Back} + +\begin{frame}{Write-Back} +\begin{itemize} + \item Processor state should be updated after all exception checks to avoid + partially changed state. + \bigskip + \item Advance \texttt{\$PC}: + \pause\bigskip + \begin{itemize} + \item For most instructions: \texttt{\$PC += instruction_length}. \\ + Exception: \texttt{REP MOVS}. + \pause\bigskip + \item Explicit \texttt{\$PC} update --- control-flow instructions: + \begin{itemize} + \item (Un)conditional (In)direct Jump/Branch, + \item Call/Return (subroutine). + \item System call/return. + \item \dots + \end{itemize} + \end{itemize} +\end{itemize} +\end{frame} + +\section{Improved Interpretation} + +\begin{frame}{Interpretation Pros and Cons} +\begin{itemize} +\item Implemented in high-level language --- portable. +\item Simple structure: reliable, extensible, reusable. +\item (Extremely) low simulation speed. +\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Where Is the Time Spent?} +\begin{lstlisting} +start: interruption = false; +while (!interruption) { + raw_code = fetch(PC); + (opcode, operands) = decode(raw_code); // <-- here + switch (opcode) { // <-- and here + case opcode1: + func1(operands); PC++; break; + case opcode2: + func2(operands); PC++; break; + /*...*/ + } +} +handle_interruption(); +goto start; +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Threaded Interpretation} +Jump right to the next instruction instead of the start of the loop: +\bigskip +\begin{lstlisting} +func0: /* simulate instr0 */; PC++; + next_opcode = decode(fetch(PC)); + goto *func_ptr[next_opcode]; +func1: /* simulate instr1 */; PC++; + next_opcode = decode(fetch(PC)); + goto *func_ptr[next_opcode]; +func2: /* simulate instr2 */; PC++; + next_opcode = decode(fetch(PC)); + goto *func_ptr[next_opcode]; +\end{lstlisting} + +\tiny\url{http://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-an-unsorted-array} +\end{frame} + 
+\begin{frame}{Cached Interpretation} +\begin{itemize} +\item Usually guest code is static. +\item It's highly probable that an instruction with some \texttt{\$PC} will be + executed many times. +\item Why decode every time? +\item Solution: create a cache mapping instruction address to decode data. +\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Cached Interpretation} +\begin{lstlisting} +while (!interruption) { + if (operation = cache[PC]); // shortcut + else { // not cached, full path + operation = decode(fetch(PC)); + cache[PC] = operation; // cache the result + } + switch (operation) { + /* ... */ + } +} +\end{lstlisting} +\end{frame} + +% TODO: slide with diagram. + +\begin{frame}{Cached Interpretation} +\begin{itemize} +\item Cache size is limited. +\item Old data needs to be removed from the cache. +\item Code modifications need to be tracked. Otherwise cache will have invalid + data. +\end{itemize} +\end{frame} + +\begin{frame}{What Was Optimized in Interpreter} +\begin{itemize} +\item \textbf<1>{Fetch} {$\leftarrow$ optimized} +\item \textbf<1>{Decode} {$\leftarrow$ optimized} +\item \textbf<2>{Execute} {$\rightarrow$ to be optimized} +\item \textbf<2>{Memory} {$\rightarrow$ to be optimized} +\item \textbf<2>{Write-Back} {$\rightarrow$ to be optimized} +\end{itemize} +\end{frame} + +\section{Binary translation} + +\begin{frame}{Translation, Compilation, Decompilation} +\begin{itemize} +\item \textbf{Translation} --- \textit{generic term} describing a process of + code conversion from one programming language into another. +\item \textbf{Compilation} --- \textit{translation} from high-level programming + language into low-level programming language. +\item \textbf{Decompilation} --- \textit{translation} from low-level programming + language into a high-level programming language. +\end{itemize} +\end{frame} + +\begin{frame}{Binary Translation} +\begin{itemize} +\item Input: guest machine code. +\item Output: host machine code. 
+\item \textbf{Binary translation, BT} --- translation of guest software written in + guest ISA into equivalent code in host ISA. +\item What for? \pause Repeated execution of the translation result. Optimizations. +\end{itemize} +\end{frame} + +\begin{frame}{Static and Dynamic Binary Translation} +\begin{itemize} +\item \textit{Static} binary translator converts a target executable file + without running it. +\item Result of static BT is saved on disk. +\item It is very difficult to do correctly. +\vfill +\item \textit{Dynamic} binary translation happens during simulation. +\item Result of dynamic BT is saved in memory. +\item Dynamic BT can adapt to the program's run-time environment. +\item Dynamic BT alternates with execution of the generated code. +\end{itemize} +\end{frame} + +\begin{frame}{Stages of Dynamic Binary Translation} +\centering +\inputpicture{dynamic-bt} +\end{frame} + +\subsection{Template-Based Translation} + +\begin{frame}{Algorithm 1: Template-Based Translation} +\begin{tikzpicture}[font=\scriptsize, >=latex, node distance=2.cm] + +\node[draw, double copy shadow={shadow xshift=3pt,shadow yshift=-3pt}, fill=white] (decode) {decode_t}; +\node[rectangle split, rectangle split parts=4, draw, right=of decode, anchor=text west, minimum width=1.8cm] (templates-raw) {Template 1\nodepart{two} Template 2\nodepart{three} Template 3\nodepart{four} Template 4}; + +\node[rectangle split, rectangle split parts=4, draw, right=of templates-raw, minimum width=1.8cm] (templates) {Capsule 1\nodepart{two} Capsule 2\nodepart{three} Capsule 3\nodepart{four} Capsule 4}; + +\node[draw, above=1cm of templates-raw, align=left] (md) {Encodings and Offsets\\for Host Instruction\\Operands}; + +\draw[->] (decode) -- (templates-raw.text west) node[midway, above]{\tiny Template Selection}; +\draw[->] (templates-raw) -- (templates); +\coordinate[above=0.5cm of templates] (junction); +\draw[->] (decode) |- (junction) -- (templates) node[pos=0, above]{\tiny Argument Substitution};
+\draw[] (md) |- (junction); +\end{tikzpicture} +\end{frame} + +\begin{frame}[fragile]{Algorithm 1: Template-Based Translation} +\begin{tiny} +\begin{itemize} + \item start_addr --- guest code's start address, + \item start_buf --- host buffer. +\end{itemize} +\end{tiny} + +\begin{lstlisting} +translate(start_addr, start_buf) { + PC = start_addr; bufptr = start_buf; + while (!enough) { + instr = fetch(PC); + (opcode, operands) = decode(instr); + (template, length) = templates[opcode]; + memcpy(bufptr, template, length); + patch_operands(bufptr, operands); + PC += instr_length; + bufptr += length; + } + memcpy(bufptr, glue_capsule, glue_length); +} +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Algorithm 1: Execution} + +\begin{lstlisting} +execute(start_buf) { + load_simulated_state(); + goto start_buf; +} +\end{lstlisting} +\pause +or +\begin{lstlisting} +typedef void (*fblock)(void); +execute(start_buf) { + load_simulated_state(); + ((fblock)start_buf)(); +} +\end{lstlisting} +\end{frame} + +% TODO: Add JIT-template for the same ADDQ instruction + +\begin{frame}{Capsule} +\begin{small} +\begin{tabular}{p{0.45\textwidth}p{0.45\textwidth}} +Guest code, Intel~64 (64-bit) & Host code, Intel~IA-32 (32-bit) +\end{tabular} +\end{small} +\vfill +\centering +\inputpicture{capsule} +\pause +Question: What part of \texttt{ADDQ} semantics is missing? 
+\end{frame} + +\begin{frame}{Argument Substitution} + +{\ttfamily\small +{\sffamily Registers:} + +\begin{tabular}{ll} +c5 f4 58 c\textcolor{red}{8} & vaddps \textcolor{red}{\%ymm0},\%ymm1,\%ymm1 \\ +c5 f4 58 c\textcolor{red}{9} & vaddps \textcolor{red}{\%ymm1},\%ymm1,\%ymm1 \\ +c5 f4 58 c\textcolor{red}{f} & vaddps \textcolor{red}{\%ymm7},\%ymm1,\%ymm1 \\\pause +c4 c1 74 58 c\textcolor{red}{8} & vaddps \textcolor{red}{\%ymm8},\%ymm1,\%ymm1 \\ +c4 c1 74 58 c\textcolor{red}{f} & vaddps \textcolor{red}{\%ymm15},\%ymm1,\%ymm1 \\ +c5 f4 58 c8 & vaddps \%ymm0,\%ymm1,\%ymm1 \\ +c5 ec 58 d0 & vaddps \%ymm0,\%ymm2,\%ymm2 \\ +c5 c4 58 f8 & vaddps \%ymm0,\%ymm7,\%ymm7 \\\pause +c4 e1 74 58 c8 & vaddps \%ymm0,\%ymm1,\%ymm1 \# Mnemonic is the same!\\ +\end{tabular} +\pause +{\sffamily Literals:} +\begin{tabular}{ll} +67 c7 85 \textcolor{blue}{00 01 00 00} \textcolor{green}{dd cc bb aa} & movl \textcolor{green}{\$0xaabbccdd},\textcolor{blue}{0x100}(\%ebp) +\end{tabular} +} + +\end{frame} + +\subsection{Translation with Intermediate Representation} + +\begin{frame}{Algorithm 2: JIT. IR Generation} +\centering +\begin{tikzpicture}[font=\small, >=latex] + +\node[rectangle split, rectangle split parts=2, draw, double copy shadow={shadow xshift=3pt,shadow yshift=-3pt}, fill=white] (sr) {Simulation routine\nodepart{two} C (subset)}; +\node[rectangle split, rectangle split parts=2, draw, below=1cm of sr, double copy shadow={shadow xshift=3pt,shadow yshift=-3pt}, fill=white] (template) {Template\nodepart{two} IR: bytecode+SSA}; +\draw[->] (sr) -- (template) node[midway, right] {SR-compiler}; + +\end{tikzpicture} +\end{frame} + +\begin{frame}{Algorithm 2: JIT.
Simulation Stage} +\begin{tikzpicture}[font=\scriptsize, >=latex, node distance=1.cm] + +\node[draw, double copy shadow={shadow xshift=3pt,shadow yshift=-3pt}, fill=white] (decode) {decode_t}; +\node[rectangle split, rectangle split parts=4, draw, right=of decode, anchor=text west, minimum width=1.8cm] (templates) {Template 1\nodepart{two} Template 2\nodepart{three} Template 3\nodepart{four} Template 4}; +\draw[->] (decode) -- (templates.text west); + +\node[draw, below=0.5cm of templates, minimum height=2cm, align=left, minimum width=1.8cm] (opt-template) {Block\\template}; +\draw[->] (templates) -- (opt-template) node[midway, right] {Optimization}; + +\node[rectangle split, rectangle split parts=3, align=left, draw, below=0.5cm of opt-template, double copy shadow={shadow xshift=3pt,shadow yshift=-3pt}, fill=white] (md) {Machine Description (md)\nodepart{two} Bytecode $\rightarrow$ machine code\nodepart{three} Host Register Definition}; + +\coordinate[right=2.5cm of opt-template] (junction); + +\node[draw, right=1cm of junction, rectangle split, rectangle split parts=2,] (host-code) {Translation Block\nodepart{two} Host machine code}; + +\draw[->] (md) -| node[pos=0.5, right, align=left] {Register Allocation\\Code Generation} (junction) -- (host-code); +\draw[] (opt-template) -- (junction); + +\end{tikzpicture} +\end{frame} + +\begin{frame}{Optimizations} +\centering +\inputpicture{bt-optimization} +\end{frame} + +\begin{frame}{Connection between Translation Blocks} +\centering +\inputpicture{bb-translation} +\end{frame} + +\begin{frame}{Why Are Optimizations During BT Complicated?} + +\begin{itemize} +\item Machine code has less information about the algorithm compared to code in + high-level programming languages. +\item Many compiler optimizations cannot be used. +% TODO: Examples with clarification. +\item BT optimizations are limited in time. +\end{itemize} +\pause +\begin{itemize} +\item Variable addresses --- not available.
+\item Function boundaries --- not available. +\item Branch addresses --- partially known. +\end{itemize} +\end{frame} + +\section*{Conclusions} + +\begin{frame}{Conclusions} +\begin{itemize} +\item Basic 5-stage pipeline. +\item Decoder, disassembler, encoder. +\item Switched interpreter. +\item Threaded interpreter. +\item Cached interpreter. +\item Exception, Interrupt, Trap, Fault\dots +\item Interpretation, Compilation, Translation. +\item Binary Translation. +\item Static and Dynamic Binary Translation. +\item Template, Capsule. +\item Intermediate Representation. +\end{itemize} +\end{frame} + +\begin{frame}[allowframebreaks]{Bibliography} +\begin{thebibliography}{99} + \bibitem{} \textit{D. Mihocka, S. Shwartsman}. Virtualization Without Direct + Execution or Jitting: Designing a Portable Virtual Machine Infrastructure. + \bibitem{} \textit{Y. Lifshitz, R. Cohn, I. Livni, O. Tabach, M. Charney, K. + Hazelwood}. Zsim: A Fast Architectural Simulator for ISA Design-Space + Exploration. + \bibitem{} \textit{F. Larsson, P. Magnusson, B. Werner}. SimGen: Development of + Efficient Instruction Set Simulators. + \bibitem{} \textit{A. Sepp, J. Kranz, A. Simon}. GDSL: A Generic Decoder + Specification Language for Interpreting Machine Language. + \bibitem{} \textit{Jim Smith and Ravi Nair}. Virtual Machines: Versatile + Platforms for Systems and Processes. + \bibitem{} \textit{Fabrice Bellard}. QEMU, a Fast and Portable Dynamic + Translator. + \bibitem{} \textit{Anton Chernoff and Ray Hookway.} {DIGITAL FX!32} Running + 32-Bit x86 Applications on {Alpha} {NT}. + \bibitem{} \textit{Leonid Baraz [et al.]} IA-32 Execution Layer: a Two-Phase + Dynamic Translator Designed to Support IA-32 Applications on + Itanium\reg-Based Systems. +\end{thebibliography} +\end{frame} + +\finalslide + +\end{document}