more explanation in parsing

jsiek · jsiek · commit dbd950a74816 · 2022-06-26T13:17:23.000-04:00
diff --git a/Makefile b/Makefile
@@ -4,7 +4,7 @@
 LATEXMK= latexmk -pdf
 
 all:
-	$(LATEXMK) book
+	$(LATEXMK) -f book
 
 cont: continuous
 continuous:
diff --git a/book.bib b/book.bib
@@ -1,52 +1,35 @@
 @book{Tomita:1985qr,
-	address = {Norwell, MA, USA},
 	author = {Masaru Tomita},
-	date-added = {2008-12-02 14:16:33 -0700},
-	date-modified = {2008-12-02 14:16:39 -0700},
-	isbn = {0898382025},
 	publisher = {Kluwer Academic Publishers},
 	title = {Efficient Parsing for Natural Language: A Fast Algorithm for Practical Systems},
 	year = {1985}}
 
 @article{Earley:1970ly,
-	acmid = {362035},
-	address = {New York, NY, USA},
 	author = {Earley, Jay},
-	date-added = {2011-05-28 11:31:46 -0600},
-	date-modified = {2011-05-28 11:31:48 -0600},
-	doi = {http://doi.acm.org/10.1145/362007.362035},
-	issn = {0001-0782},
 	issue = {2},
 	journal = {Commun. ACM},
-	keywords = {compilers, computational complexity, context-free grammar, parsing, syntax analysis},
 	month = {February},
 	numpages = {9},
 	pages = {94--102},
 	publisher = {ACM},
 	title = {An efficient context-free parsing algorithm},
-	url = {http://doi.acm.org/10.1145/362007.362035},
 	volume = {13},
-	year = {1970},
-	Bdsk-File-1 = {YnBsaXN0MDDRAQJccmVsYXRpdmVQYXRoXnA5NC1lYXJsZXkucGRmCAsYAAAAAAAAAQEAAAAAAAAAAwAAAAAAAAAAAAAAAAAAACc=},
-	Bdsk-Url-1 = {http://doi.acm.org/10.1145/362007.362035}}
+	year = {1970}}
 
-@Book{Hopcroft06:_automata,
+@book{Hopcroft06:_automata,
   author = 	 {John Hopcroft and Rajeev Motwani and Jeffrey Ullman},
   title = 	 {Introduction to Automata Theory, Languages, and Computation},
   publisher = 	 {Pearson},
   year = 	 2006}
 
 @techreport{Lesk:1975uq,
 	author = {M. E. Lesk and E. Schmidt},
-	date-added = {2007-08-27 13:37:27 -0600},
-	date-modified = {2009-08-25 22:28:17 -0600},
 	institution = {Bell Laboratories},
 	month = {July},
 	title = {Lex - A Lexical Analyzer Generator},
-	year = {1975},
-	Bdsk-File-1 = {YnBsaXN0MDDRAQJccmVsYXRpdmVQYXRoV2xleC5wZGYICxgAAAAAAAABAQAAAAAAAAADAAAAAAAAAAAAAAAAAAAAIA==}}
+	year = {1975}}
 
-@Misc{shinan20:_lark_docs,
+@misc{shinan20:_lark_docs,
   author = 	 {Erez Shinan},
   title = 	 {Lark Documentation},
   url = {https://lark-parser.readthedocs.io/en/latest/index.html},
diff --git a/book.tex b/book.tex
@@ -196,6 +196,7 @@
 
 %\listoftables
 
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \chapter*{Preface}
 \addcontentsline{toc}{fmbm}{Preface}
@@ -247,7 +248,7 @@ \chapter*{Preface}
   the fundamental tools of compiler construction: \emph{abstract
     syntax trees} and \emph{recursive functions}. 
 {\if\edition\pythonEd
-\item In Chapter~\ref{ch:parsing-Lvar} we learn how to use the Lark
+\item In Chapter~\ref{ch:parsing} we learn how to use the Lark
   parser generator to create a parser for the language of integer
   arithmetic and local variables. We learn about the parsing
   algorithms inside Lark, including Earley and LALR(1).
@@ -307,14 +308,13 @@ \chapter*{Preface}
 mathematics.
 %
 At the beginning of the course, students form groups of two to four
-people.  The groups complete one chapter every two weeks, starting
-with chapter~\ref{ch:Lvar} and finishing with
-chapter~\ref{ch:Llambda}. Many chapters include a challenge problem
-that we assign to the graduate students. The last two weeks of the
+people.  The groups complete approximately one chapter every two
+weeks, starting with chapter~\ref{ch:Lvar}. The last two weeks of the
 course involve a final project in which students design and implement
 a compiler extension of their choosing.  The last few chapters can be
-used in support of these projects.  For compiler courses at
-universities on the quarter system (about ten weeks in length), we
+used in support of these projects.  Many chapters include a challenge
+problem that we assign to the graduate students. For compiler courses
+at universities on the quarter system (about ten weeks in length), we
 recommend completing the course through chapter~\ref{ch:Lvec} or
 chapter~\ref{ch:Lfun} and providing some scaffolding code to the
 students for each compiler pass.
@@ -337,7 +337,6 @@ \chapter*{Preface}
 Technology, University of Freiburg, University of Massachusetts
 Lowell, and the University of Vermont.
 
-
 \begin{figure}[tp]
 \begin{tcolorbox}[colback=white]
   {\if\edition\racketEd
@@ -370,32 +369,35 @@ \chapter*{Preface}
 \fi}
 {\if\edition\pythonEd
 \begin{tikzpicture}[baseline=(current  bounding  box.center)]
-  \node (C1) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
-  \node (C2) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
-  \node (C3) at (8,1.5) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
-  \node (C4) at (0,0) {\small Ch.~\ref{ch:Lif} Conditionals};
-  \node (C5) at (4,0) {\small Ch.~\ref{ch:Lvec} Tuples};
-  \node (C6) at (8,0) {\small Ch.~\ref{ch:Lfun} Functions};
-  \node (C9) at (0,-1.5) {\small Ch.~\ref{ch:Lwhile} Loops};
-  \node (C8) at (4,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
+  \node (Prelim) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
+  \node (Var) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
+  \node (Parse) at (8,1.5) {\small Ch.~\ref{ch:parsing} Parsing};
+  \node (Reg) at (0,0) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
+  \node (Cond) at (4,0) {\small Ch.~\ref{ch:Lif} Conditionals};
+  \node (Loop) at (8,0) {\small Ch.~\ref{ch:Lwhile} Loops};
+  \node (Fun) at (0,-1.5) {\small Ch.~\ref{ch:Lfun} Functions};
+  \node (Tuple) at (4,-1.5) {\small Ch.~\ref{ch:Lvec} Tuples};
+  \node (Dyn) at (8,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
 %  \node (CO) at (0,-3) {\small Ch.~\ref{ch:Lobject} Objects};
-  \node (C7) at (8,-1.5) {\small Ch.~\ref{ch:Llambda} Lambda};
-  \node (C10) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
-  \node (C11) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
-
-  \path[->] (C1) edge [above] node {} (C2);
-  \path[->] (C2) edge [above] node {} (C3);
-  \path[->] (C3) edge [above] node {} (C4);
-  \path[->] (C4) edge [above] node {} (C5);
-  \path[->,style=dotted] (C5) edge [above] node {} (C6);
-  \path[->] (C5) edge [above] node {} (C7);
-  \path[->] (C6) edge [above] node {} (C7);
-  \path[->] (C4) edge [above] node {} (C8);
-  \path[->] (C4) edge [above] node {} (C9);
-  \path[->] (C7) edge [above] node {} (C10);
-  \path[->] (C8) edge [above] node {} (C10);
-%  \path[->] (C8) edge [above] node {} (CO);
-  \path[->] (C10) edge [above] node {} (C11);
+  \node (Lam) at (0,-3) {\small Ch.~\ref{ch:Llambda} Lambda};
+  \node (Gradual) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
+  \node (Generic) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
+
+  \path[->] (Prelim) edge [above] node {} (Var);
+  \path[->] (Var) edge [above] node {} (Reg);
+  \path[->] (Var) edge [above] node {} (Parse);
+  \path[->] (Reg) edge [above] node {} (Cond);
+  \path[->] (Cond) edge [above] node {} (Tuple);
+  \path[->,style=dotted] (Tuple) edge [above] node {} (Fun);
+  \path[->] (Cond) edge [above] node {} (Fun);
+  \path[->] (Tuple) edge [above] node {} (Lam);
+  \path[->] (Fun) edge [above] node {} (Lam);
+  \path[->] (Cond) edge [above] node {} (Dyn);
+  \path[->] (Cond) edge [above] node {} (Loop);
+  \path[->] (Lam) edge [above] node {} (Gradual);
+  \path[->] (Dyn) edge [above] node {} (Gradual);
+%  \path[->] (Dyn) edge [above] node {} (CO);
+  \path[->] (Gradual) edge [above] node {} (Generic);
 \end{tikzpicture}
 \fi}
 \end{tcolorbox}
@@ -506,9 +508,11 @@ \chapter{Preliminaries}
   syntax}\index{subject}{abstract syntax
   tree}\index{subject}{AST}\index{subject}{program}\index{subject}{parse}
 The process of translating from concrete syntax to abstract syntax is
-called \emph{parsing}~\citep{Aho:2006wb}\python{ and is studied in
-  chapter~\ref{ch:parsing-Lvar}}.
-\racket{This book does not cover the theory and implementation of parsing.}%
+called \emph{parsing}\python{ and is studied in
+  chapter~\ref{ch:parsing}}.
+\racket{This book does not cover the theory and implementation of parsing.
+  We refer the readers interested in parsing to the thorough treatment
+  of parsing by \citet{Aho:2006wb}.}%
 %
 \racket{A parser is provided in the support code for translating from
   concrete to abstract syntax.}%
@@ -4090,23 +4094,23 @@ \section{Challenge: Partial Evaluator for \LangVar{}}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 {\if\edition\pythonEd
 \chapter{Parsing}
-\label{ch:parsing-Lvar}
+\label{ch:parsing}
 \setcounter{footnote}{0}
 \index{subject}{parsing}
 
 In this chapter we learn how to use the Lark parser
-generator~\citep{shinan20:_lark_docs} to translate the concrete syntax
+framework~\citep{shinan20:_lark_docs} to translate the concrete syntax
 of \LangInt{} (a sequence of characters) into an abstract syntax tree.
 You will then be asked to use Lark to create a parser for \LangVar{}.
-We then learn about the parsing algorithms used inside Lark, studying
-the \citet{Earley:1970ly} and LALR algorithms.
+We also describe the parsing algorithms used inside Lark, studying the
+\citet{Earley:1970ly} and LALR(1) algorithms.
 
-A parser generator takes in a specification of the concrete syntax and
-produces a parser. Even though a parser generator does most of the
-work for us, using one properly requires some knowledge.  In
-particular, we must learn about the specification languages used by
-parser generators and we must learn how to deal with ambiguity in our
-language specifications.
+A parser framework such as Lark takes in a specification of the
+concrete syntax and the input program and produces a parse tree. Even
+though a parser framework does most of the work for us, using one
+properly requires some knowledge.  In particular, we must learn about
+its specification languages and we must learn how to deal with
+ambiguity in our language specifications.
 
 The process of parsing is traditionally subdivided into two phases:
 \emph{lexical analysis} (also called scanning) and \emph{syntax
@@ -4119,16 +4123,16 @@ \chapter{Parsing}
 the use of a faster but less powerful algorithm for lexical analysis
 and the use of a slower but more powerful algorithm for parsing.
 %
-Likewise, parser generators typical come in pairs, with separate
-generators for the lexical analyzer (or lexer for short) and for the
-parser.  A paricularly influential pair of generators were
-\texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
-by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
-written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
-Compiler Compiler.
-
-The Lark parse generator that we use in this chapter includes both a
-lexical analyzer and a parser. The next section discusses lexical
+%% Likewise, parser generators typical come in pairs, with separate
+%% generators for the lexical analyzer (or lexer for short) and for the
+%% parser.  A paricularly influential pair of generators were
+%% \texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
+%% by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
+%% written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
+%% Compiler Compiler.
+%
+The Lark parser framework that we use in this chapter includes both
+lexical analyzers and parsers. The next section discusses lexical
 analysis and the remainder of the chapter discusses parsing.
 
 
@@ -4522,10 +4526,13 @@ \section{The Earley Algorithm}
 more efficient but can only handle a subset of the context-free
 grammars.
 
-The Earley algorithm uses a data structure called a
-\emph{chart}\index{subject}{chart} to keep track of its progress.  The
-chart is an array with one slot for each position in the input string,
-where position $0$ is before the first character and position $n$ is
+The Earley algorithm can be viewed as an interpreter; it treats the
+grammar as the program being interpreted and it treats the concrete
+syntax of the program-to-be-parsed as its input.  The Earley algorithm
+uses a data structure called a \emph{chart}\index{subject}{chart} to
+keep track of its progress and to memoize its results. The chart is an
+array with one slot for each position in the input string, where
+position $0$ is before the first character and position $n$ is
 immediately after the last character. So the array has length $n+1$
 for an input string of length $n$. Each slot in the chart contains a
 set of \emph{dotted rules}. A dotted rule is simply a grammar rule
@@ -4553,8 +4560,8 @@ \section{The Earley Algorithm}
 \begin{lstlisting}
   lang_int: . stmt_list         (0)
 \end{lstlisting}
-in slot $0$ of the chart. The algorithm then proceeds to its
-\emph{prediction} phase in which it adds more dotted rules to the
+in slot $0$ of the chart. The algorithm then proceeds with
+\emph{prediction} actions in which it adds more dotted rules to the
 chart based on which nonterminal come after a period. In the above,
 the nonterminal \code{stmt\_list} appears after a period, so we add all
 the rules for \code{stmt\_list} to slot $0$, with a period at the
@@ -4767,13 +4774,15 @@ \section{The Earley Algorithm}
 \section{The LALR(1) Algorithm}
 \label{sec:lalr}
 
-The LALR(1) algorithm consists of a finite automata and a stack to
-record its progress in parsing the input string.  Each element of the
-stack is a pair: a state number and a grammar symbol (a terminal or
-nonterminal). The symbol characterizes the input that has been parsed
-so-far and the state number is used to remember how to proceed once
-the next symbol-worth of input has been parsed.  Each state in the
-finite automata represents where the parser stands in the parsing
+The LALR(1) algorithm can be viewed as a two-phase approach in which
+it first compiles the grammar into a state machine and then runs the
+state machine to parse the input string.  The state machine also uses
+a stack to record its progress in parsing the input string.  Each
+element of the stack is a pair: a state number and a grammar symbol (a
+terminal or nonterminal). The symbol characterizes the input that has
+been parsed so far and the state number is used to remember how to
+proceed once the next symbol-worth of input has been parsed.  Each
+state in the machine represents where the parser stands in the parsing
 process with respect to certain grammar rules. In particular, each
 state is associated with a set of dotted rules.
 
@@ -4797,7 +4806,7 @@ \section{The LALR(1) Algorithm}
 \emph{item}. There are several rules that could apply next, both rule
 2 and 3, so state 1 also shows those rules with a period at the
 beginning of their right-hand sides. The edges between states indicate
-which transitions the automata should make depending on the next input
+which transitions the machine should make depending on the next input
 token. So, for example, if the next input token is \code{INT} then the
 parser will push \code{INT} and the target state 4 on the stack and
 transition to state 4.  Suppose we are now at the end of the input. In
@@ -10155,7 +10164,7 @@ \subsection{Optimize Blocks}
 the constant \TRUE{} in \code{explicate\_pred}, in which we discard the
 \code{els} continuation.
 %
- {\if\edition\racketEd
+{\if\edition\racketEd
 The following example program falls into this
 case, and it creates two unused blocks.       
 \begin{center}
@@ -10277,11 +10286,12 @@ \subsection{Optimize Blocks}
       [else
         (let ([label (gensym 'block)])
           (set! basic-blocks (cons (cons label t) basic-blocks))
-          (Goto label))]))
+          (Goto label))])))
 \end{lstlisting}
 \end{minipage}
 \end{center}
 \fi}
+
 {\if\edition\pythonEd
 %
 Here is the new version of the \code{create\_block} auxiliary function
@@ -20663,6 +20673,7 @@ \section{Type Checking \LangGrad{}}
 
 \fi}
 
+
 \clearpage
 
 \section{Interpreting \LangCast{}}
@@ -20780,7 +20791,7 @@ \section{Interpreting \LangCast{}}
 from \CANYTY{} to \INTTY{}.
 }
 \python{
-  For the subscript \code{v[i]} in \code{f([v[i])} of \code{map\_inplace},
+  For the subscript \code{v[i]} in \code{f(v[i])} of \code{map\_inplace},
   the proxy casts the integer from \INTTY{} to \CANYTY{}.
   For the subscript on the left of the assignment,
   the proxy casts the tagged value from \CANYTY{} to \INTTY{}.