% NLP-Advanced-Text-Processing-Ass.tex


\documentclass[11pt]{article}
%\usepackage[top=20mm,left=20mm,right=20mm,bottom=15mm,a4paper]{geometry} % see geometry.pdf on how to lay out the page. There's lots.
\usepackage[top=20mm,left=20mm,right=20mm,bottom=15mm,headsep=15pt,footskip=15pt,a4paper]{geometry} % see geometry.pdf on how to lay out the page. There's lots.
%\geometry{a4paper} % or letter or a5paper or ... etc
% \geometry{landscape} % rotated page geometry
\usepackage[round]{natbib}
\setlength{\bibsep}{0.0pt}
\usepackage{color}
\usepackage{times}
%\usepackage[T1]{fontenc}
%\usepackage{mathptmx}
\usepackage{tikz-dependency}
\usepackage{enumitem}
%\usepackage{times}
\usepackage{here}

\usepackage[procnames]{listings}
\usepackage{color}
\usepackage{todonotes}
 
 
% See the ``Article customise'' template for come common customisations
% Cross-reference helpers. The argument is the label name WITHOUT its type
% prefix; each macro prepends "eq:", "fig:", "tab:" or "sec:" itself and ties
% the printed word to the number with a non-breaking space where one is used.
\newcommand{\refeq}[1]{%
  Equation~\ref{eq:#1}%
}
\newcommand{\reffig}[1]{%
  Figure~\ref{fig:#1}%
}
\newcommand{\reftab}[1]{%
  Table~\ref{tab:#1}%
}
\newcommand{\refsec}[1]{%
  \textsection\ref{sec:#1}%
}

% Sectioning helpers. Both end in \noindent, which applies to the text that
% immediately follows the heading.
%   \newsec{<title>}             -- section without a label
%   \newsubsec{<title>}{<name>}  -- subsection labelled "sec:<name>"
\newcommand{\newsec}[1]{%
  \section{#1}\noindent
}
%\newcommand{\newsec}[2]{\section{#1}\label{sec:#2}\noindent}
\newcommand{\newsubsec}[2]{%
  \subsection{#1}\label{sec:#2}\noindent
}
% Upright "argmax"/"argmin" math operators whose sub-/superscripts are set as
% limits (underneath/above) in display style and as ordinary scripts inline.
% Built from the kernel primitives \mathop and \displaylimits instead of
% \operatornamewithlimits: that command is defined by amsmath (amsopn), which
% this preamble does not load, so the previous definitions would fail with
% "Undefined control sequence" the first time either macro was used.
\newcommand{\argmax}{\mathop{\mathrm{argmax}}\displaylimits}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}\displaylimits}

\makeatletter         
% Custom title block used by \maketitle: title and author in bold inside a
% centred block, followed by a horizontal rule. \@date is not typeset here.
\def\@maketitle{%
  \begin{center}%
    {\bfseries \@title}%
    {\bfseries \@author}%
  \end{center}%
  \smallskip
  \hrule
  \bigskip
}

% Custom \section: flush-left heading in bold \large with tightened spacing.
% Arguments of \@startsection, in order:
%   name, level, indent, beforeskip, afterskip, style.
% A negative beforeskip means its absolute value is the space above the
% heading and the first paragraph after it is not indented; a positive
% afterskip makes this a display heading with that much space below.
\renewcommand{\section}{%
  \@startsection{section}{1}%
    {0mm}%
    {-0.8\baselineskip}%
    {0.3\baselineskip}%
    {\bfseries\large}%
}

% Custom \subsection: same spacing and the same font (bold \large) as the
% custom \section; only the counter name and the level differ.
% Arguments of \@startsection, in order:
%   name, level, indent, beforeskip, afterskip, style.
\renewcommand{\subsection}{%
  \@startsection{subsection}{2}%
    {0mm}%
    {-0.8\baselineskip}%
    {0.3\baselineskip}%
    {\bfseries\large}%
}

% Custom \paragraph: bold, normal-size heading with no indentation.
% The negative afterskip makes it a run-in heading, i.e. the following text
% continues on the same line after 1em of horizontal space.
\renewcommand{\paragraph}{%
  \@startsection{paragraph}{4}%
    {\z@}%
    {1.5ex \@plus 1ex \@minus .2ex}%
    {-1em}%
    {\normalfont\normalsize\bfseries}%
}
% End of the region in which @ may be used in control-sequence names.
\makeatother

% taken from MdL
% titlemize{<heading>}: an itemize list introduced by a \paragraph heading.
% \itemsep is set to 0pt inside the list, so no extra glue is added between
% consecutive items.
\newenvironment{titlemize}[1]{%
  \paragraph{#1}%
  \begin{itemize}%
    \setlength\itemsep{0pt}%
}{%
  \end{itemize}%
}


%\title{{\LARGE Universal Parser (UP)}\\[-8mm]
%\includegraphics[height=8mm]{RUPA}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\includegraphics[height=8mm]{RUPA}}
% Two-line title: course name in \LARGE, then the assignment name in \large,
% separated by a line break with 1.5mm of extra space.
\title{%
  {\LARGE Natural Language Processing}\\[1.5mm]%
  {\large Assignment 2: Advanced Text Processing}%
}%\\Lemmatisation and Part-of-Speech-Tagging}}
\author{}
% NOTE: the custom \@maketitle defined earlier in this preamble typesets only
% \@title and \@author, so no date is printed whether or not \date is set.
\date{}

%%% BEGIN DOCUMENT
\begin{document}

% Colours used by the listings style set up below.
% NOTE: "red" and "green" replace the predefined colours of the same names
% with darker shades for the rest of the document.
\definecolor{keywords}{RGB}{255,0,90}
\definecolor{comments}{RGB}{0,0,113}
\definecolor{green}{RGB}{0,150,0}
\definecolor{red}{RGB}{160,0,0}

% "UU" palette: three greys and a dark red. Only UUred is used in the text of
% this file (for the answer-length hints).
\definecolor{UUred}{RGB}{153,0,0}
\definecolor{UUdark}{RGB}{130,130,130}
\definecolor{UUmedium}{RGB}{190,190,190}
\definecolor{UUlight}{RGB}{230,230,230}

% Document-wide listings settings for Python code: small typewriter font, one
% colour each for keywords, comments, strings and identifiers, and no visible
% space markers inside strings. procnamekeys belongs to the "procnames"
% aspect requested when listings is loaded.
\lstset{%
  language=Python,
  basicstyle=\ttfamily\small,
  showstringspaces=false,
  keywordstyle=\color{keywords},
  identifierstyle=\color{green},
  stringstyle=\color{red},
  commentstyle=\color{comments},
  procnamekeys={def,class}%
}

\maketitle
%\tableofcontents
%\vspace{3mm}

\section{Introduction}
\noindent This assignment involves material from lectures 5, 6 and 7. You should
have watched the relevant videos, read the relevant chapters in the textbook and
made a serious attempt at completing the relevant labs before you attempt this
assignment. If you feel you have done that and still find the instructions
unclear, you are welcome to email the course teachers and/or go to office hours
to ask for help. The assignment is split into 2 sections, one about POS tagging,
one about Lemmatisation. Each section is worth 10 points. If you are interested
in receiving a VG grade, you may complete the \textit{optional} exercise
available to you, which is related to Hidden Markov Models. Note that the exercise
must be completed in full in order to receive VG credit. We expect between half
a page and a page for each section, except when stated otherwise. Please do not
submit more than 5 pages overall. Your answers for each section should be
self-contained. %\todo[inline,
%color=blue!40]{FC: Right now, the assignment consists of many smaller
%  questions. Can/Should we make them fewer bigger questions?}

\newsec{POS-Tagging}%
\begin{itemize}
\item In Lab 5, you have tuned a tagger. Based on the best version of
  your tagger, you should perform a manual error analysis where you go
  through at least 5 sentences and comment on the errors made by the
  tagger. Are the mistagged words genuinely ambiguous? Why do you
  think they were mistagged? Is it possible that some of the words are
  mistagged in the gold standard? \textcolor{UUred}{[ca 1/2 page]}\footnote{If you need more information about the tagset, see \texttt{http://universaldependencies.org/u/pos/index.html}.}
\item What tagsets exist for your native language? List the ones you
  can find (spend no more than 15~minutes on the search) and describe one
  of them in more detail.
  % \textcolor{UUred}{[$>$ 10
%    sentences]} %\\\textbf{\textcolor{blue}{[tagsets $|$
                            %own web search]}}
\item Is it necessary to tokenize text before tagging it? Please
  motivate your answer and give at least one
  example.% \textcolor{UUred}{[$>$ 10
%    sentences]} %\\\textbf{\textcolor{blue}{[Read the book]}}
\item In the HMM lab we have investigated key sequences and predicted words. 
What do these correspond to when using HMMs for POS-tagging? Please motivate your answer. %\textcolor{UUred}{[$>$ 10
%    sentences]}
  %\\\textbf{\textcolor{blue}{[HMMs $|$ Back-ref to lab]}}
\end{itemize}

%\todo[inline, color=blue!40]{FC: We should discuss whether it is OK to
%  keep these individual recommendations for answer lengths. I saw that
%  I am the only one using it.}

\newsec{Lemmatisation}%
%What are they supposed to do?

\begin{itemize}
\item In Lab 7, you have tuned a lemmatizer. Based on the best version
  of your lemmatizer, you should do a manual analysis of remaining
  errors. Describe at least 5 error types and discuss how they could
  be tackled in a more sophisticated lemmatizer. \textcolor{UUred}{[ca
    1/2 page]}
\item Lemmatizers are often implemented as finite-state transducers
  (FSTs). While this kind of implementation is beyond the scope of
  this course, Chapter 3.5 (FSTs for Morphological Parsing) of our
  course book gives examples of how FSTs can be visualised. Draw an
  FST based on the initial lemmatizer we gave you in Lab 7 (repeated
  on the last page of this assignment) that can analyse the following
  words: \texttt{cats NOUN}, \texttt{jumped VERB}, \texttt{higher
    ADJ}. You can \textbf{either} draw the FST by hand, take a picture, and
  transform it to \texttt{.pdf}, \textbf{or} use a drawing program
  (e.g., xfig or MS Paint) and transform the output into
  \texttt{.pdf}. Regardless of how you produce the drawing, it must be included in your submission and 
  \textbf{not} be submitted as a separate file. Please describe
  your drawing. %\textcolor{UUred}{[$>$ 10 sentences]}
\item Why is it more difficult to tag morphologically rich languages?
  Please reflect and motivate your answer. %\textcolor{UUred}{[min. 10
%    sentences]}
\end{itemize}
\clearpage

\newsec{VG: Hidden Markov Models}

Consider the following \texttt{TRAIN} ``corpus'':

\begin{verbatim}
she/PRON books/VERB trips/NOUN for/ADP kids/NOUN
she/PRON kids/VERB you/PRON all/DET the/DET time/NOUN
kids/NOUN like/VERB books/NOUN like/ADP these/PRON
\end{verbatim}

\noindent Compute the transition and emission probabilities of a 1st order Hidden Markov
Model POS-tagger using this data. Show your results in separate tables.
\\
\\
\noindent Now, consider the following \texttt{TEST} sentence:

\begin{verbatim}
kids like books
\end{verbatim}

\noindent Report the most likely tag sequence for this sentence, as well as its
probability given the \texttt{TRAIN} corpus. Show your work by drawing a Viterbi
trellis. 

\section{Grading Criteria}
To pass the assignment, you must meet all the basic criteria on all
subparts of the assignment.  To get VG, you must in addition meet some
of the additional criteria for most of the subparts.

\begin{titlemize}{Basic Criteria}
    \item Answers are given in understandable English.
    \item Answers are stated clearly and coherently.
    \item Answers are essentially correct.
\end{titlemize}
\begin{titlemize}{Additional Criteria}
    \item Answers are well motivated.
    \item Answers are well illustrated.
    \item Answers reveal extensive knowledge of the textbook chapter(s).
\end{titlemize}


\section{Submit the assignment}
%\todo[inline, color=blue!40]{FC: we should decide whether or not we
%  want them to submit in LaTeX}
\noindent
Please submit your assignment through Studium as a PDF file without identifying
information (i.e., do not include your name in the report or the file name). It
should follow the style and margins given in the example submission, even if
not created with LaTeX. The deadlines can be found in Studium.

%Submit your assignment as a pdf file named
%firstname\_lastname\_assignment\_2.pdf. It should follow the style and
%margins given in the example submission even if not created with
%LaTeX. See deadline on studentportalen.


%Submit your assignment as a pdf file named
%firstname\_lastname\_assignment\_2.pdf. It should follow the style and
%margins given in the example submission even if not created with
%LaTeX. The submission is due in \emph{Studentportalen} before
%Wednesday November 28th at 20h00. Later submissions will be considered
%failed submissions and assessed after the final re-submission deadline
%on January 11th.%\todo[inline, color=green!50]{FC: added submission format requirements}
% To pass the assignment, you must have answered both
% sections, reached at least 5 points in each section and at least 12
% overall. To get VG, you should obtain at least 8 in each section and
% at least 18 points overall.  \todo[inline, color=yellow!40]{FC: This
%   sentence is a bit redundant. They cannot get 18 points without
%   getting at least 8 points in each question anyway, right? Moreover,
%   I changed the wording here from 'questions' to 'sections' in order
%   to make it fit with the text in the introduction where we say that
%   each \textbf{section} is worth 10 points.} \todo[inline, color=blue!40]{FC: maybe
%   say ``must have answered \underbar{all} questions?''. Sometimes, the
%   large questions contain sub-questions.}  \todo[inline,
% color=blue!40]{FC: I added the re-submission policy here and will add
%   it to the webpage as well.}


  % \item Reflect on the importance of lemmatisation for your native
%  language and at least one foreign language you know.\\
%  \textbf{\textcolor{blue}{$\rightarrow$ [productivity $|$
%      morphological richness]}}
%\item How difficult do you estimate lemmatisation to be for your
%  native language (and why)? \\\textbf{\textcolor{blue}{$\rightarrow$
%      [ambiguity]}}
%\item What is an FST? What is it used for and why is it useful? (ca. 1/2 page)\\
%  \textbf{\textcolor{blue}{$\rightarrow$ [Technical background $|$
%      Read the book, esp. 3.4-3.7]}}
%\newsec{Illustrate an FST} %

%\newsec{Error analysis}{error}%

% \newsec{Summing up}%

% \begin{itemize}
% \item Why is it more difficult to tag morphologically rich
%   languages?\\\textbf{\textcolor{blue}{[Morphological Productivity
%       $\rightarrow$ Unknown words in tag lexicon]}}
% \item What are the possible advantages/disadvantages of first
%   performing lemmatisation and then POS-tagging and doing it vice
%   versa? \\\textbf{\textcolor{blue}{$\rightarrow$ [error propagation
%       $|$ reduce/enhance ambiguity $|$ out-of-the-box thinking]}}
% \end{itemize}

% Framed listing of the Lab 7 skeleton lemmatizer, scaled to 60% of the text
% width and pinned in place with [H].
\begin{figure}[H]
  % \centering rather than a center environment: center is a list-based
  % environment and inserts extra vertical space inside the float.
  \centering
    \resizebox{0.6\textwidth}{!}{
    \fbox{
\lstinputlisting{code/lemmatizer1.py}
}}
\caption{Skeleton code for a rule-based lemmatizer, taken from Lab 7.}
\end{figure}


\end{document}