% NLP-Advanced-Text-Processing-Ass.tex
\documentclass[11pt]{article}
%\usepackage[top=20mm,left=20mm,right=20mm,bottom=15mm,a4paper]{geometry} % see geometry.pdf on how to lay out the page. There's lots.
\usepackage[top=20mm,left=20mm,right=20mm,bottom=15mm,headsep=15pt,footskip=15pt,a4paper]{geometry} % see geometry.pdf on how to lay out the page. There's lots.
%\geometry{a4paper} % or letter or a5paper or ... etc
% \geometry{landscape} % rotated page geometry
\usepackage[round]{natbib}
\setlength{\bibsep}{0.0pt}
\usepackage{color}
\usepackage{times}
%\usepackage[T1]{fontenc}
%\usepackage{mathptmx}
\usepackage{tikz-dependency}
\usepackage{enumitem}
%\usepackage{times}
\usepackage{here}
\usepackage[procnames]{listings}
\usepackage{color}
\usepackage{todonotes}
% See the ``Article customise'' template for some common customisations
% Cross-reference shorthands. Each takes the label suffix as #1 and prepends
% the matching label prefix (eq:, fig:, tab:, sec:) together with the printed
% name or symbol, joined to the number by a non-breaking space where a word
% is used.
\newcommand{\refeq}[1]{%
  Equation~\ref{eq:#1}%
}
\newcommand{\reffig}[1]{%
  Figure~\ref{fig:#1}%
}
\newcommand{\reftab}[1]{%
  Table~\ref{tab:#1}%
}
\newcommand{\refsec}[1]{%
  \textsection\ref{sec:#1}%
}
% Sectioning shorthands: start the (sub)section and suppress the indent of
% the text that follows. \newsec takes only the title; \newsubsec also takes
% a label suffix (#2), stored as sec:#2.
\newcommand{\newsec}[1]{%
  \section{#1}\noindent
}
%\newcommand{\newsec}[2]{\section{#1}\label{sec:#2}\noindent}
\newcommand{\newsubsec}[2]{%
  \subsection{#1}\label{sec:#2}\noindent
}
% Upright "argmax"/"argmin" math operators whose sub/superscripts are set
% underneath in display style and to the side in text style.
% Built from TeX primitives (\mathop + \displaylimits) instead of
% \operatornamewithlimits, which is an amsmath command: no \usepackage line
% in this preamble requests amsmath, so the previous definitions would raise
% "Undefined control sequence" the first time either macro was used.
\newcommand{\argmax}{\mathop{\mathrm{argmax}}\displaylimits}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}\displaylimits}
\makeatletter
% Custom \@maketitle: a centred bold title, then the bold author on its own
% centred line, closed off by a horizontal rule. Only \@title and \@author
% are typeset here.
\renewcommand{\@maketitle}{%
  \begin{center}%
    {\bfseries \@title}%
    % End the title paragraph explicitly so that a non-empty \author starts
    % on a new line instead of running on directly after the title text.
    % With an empty \author (as set in this file) the output is unchanged.
    \par
    {\bfseries \@author}%
  \end{center}%
  \smallskip
  \hrule
  \bigskip
}
% Custom \section heading, built on the kernel's \@startsection.
% Arguments, in order: name, level, indent, skip before, skip after, style.
\renewcommand{\section}{%
  \@startsection{section}% name
    {1}%                   level
    {0mm}%                 indent
    {-0.8\baselineskip}%   skip before the heading
    {0.3\baselineskip}%    skip after the heading
    {\bfseries\large}%     heading style
}
% Custom \subsection heading, built on the kernel's \@startsection.
% Uses the same skips and the same bold \large style as \section; only the
% name and level differ.
\renewcommand{\subsection}{%
  \@startsection{subsection}% name
    {2}%                      level
    {0mm}%                    indent
    {-0.8\baselineskip}%      skip before the heading
    {0.3\baselineskip}%       skip after the heading
    {\bfseries\large}%        heading style
}
% Custom \paragraph heading: level 4, no indent, normal-size bold text.
% The before skip is stretchable glue; the after skip is -1em.
\renewcommand{\paragraph}{%
  \@startsection{paragraph}%           name
    {4}%                               level
    {\z@}%                             indent
    {1.5ex \@plus 1ex \@minus .2ex}%   skip before the heading
    {-1em}%                            after skip
    {\normalfont\normalsize\bfseries}% heading style
}
% Internal (@) macro names are no longer needed past this point.
\makeatother
% taken from MdL
% titlemize: an itemize list introduced by a \paragraph heading.
%   #1  heading text passed to \paragraph
% The extra space between items (\itemsep) is set to zero inside the list.
\newenvironment{titlemize}[1]{%
  \paragraph{#1}%
  \begin{itemize}%
    \setlength{\itemsep}{0pt}%
}{%
  \end{itemize}%
}
%\title{{\LARGE Universal Parser (UP)}\\[-8mm]
%\includegraphics[height=8mm]{RUPA}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\includegraphics[height=8mm]{RUPA}}
% Two-line title: course name in \LARGE, a 1.5mm gap, then the assignment name in \large.
\title{{\LARGE Natural Language Processing}\\[1.5mm]{\large Assignment 2: Advanced Text Processing}}%\\Lemmatisation and Part-of-Speech-Tagging}}
% Author deliberately left empty.
\author{}
\date{} % NOTE: the custom \@maketitle defined earlier in this preamble typesets only \@title and \@author, so no date is printed whether or not this line is present
%%% BEGIN DOCUMENT
\begin{document}
% Colours for the Python listing style configured below.
\definecolor{keywords}{RGB}{255,0,90}
\definecolor{comments}{RGB}{0,0,113}
% String and identifier colours get their own names, in line with
% keywords/comments above. They were previously defined as `red' and `green',
% which overwrote the predefined colours of those names for everything that
% follows in the document.
\definecolor{strings}{RGB}{160,0,0}
\definecolor{identifiers}{RGB}{0,150,0}
% Grey shades and the accent red used for the answer-length hints (UUred).
\definecolor{UUlight}{RGB}{230,230,230}
\definecolor{UUmedium}{RGB}{190,190,190}
\definecolor{UUdark}{RGB}{130,130,130}
\definecolor{UUred}{RGB}{153,0,0}
% Global listings style: small typewriter Python code with one colour per
% token class; def/class are registered as procedure-name keys (the package
% is loaded with the procnames option).
\lstset{%
  language=Python,
  basicstyle=\ttfamily\small,
  keywordstyle=\color{keywords},
  commentstyle=\color{comments},
  stringstyle=\color{strings},
  showstringspaces=false,
  identifierstyle=\color{identifiers},
  procnamekeys={def,class}}
\maketitle
%\tableofcontents
%\vspace{3mm}
\section{Introduction}
\noindent This assignment involves material from lectures 5, 6 and 7. You should
have watched the relevant videos, read the relevant chapters in the textbook and
made a serious attempt at completing the relevant labs before you attempt this
assignment. If you feel you have done that and still find the instructions
unclear, you are welcome to email the course teachers and/or go to office hours
to ask for help. The assignment is split into 2 sections, one about POS tagging,
one about Lemmatisation. Each section is worth 10 points. If you are interested
in receiving a VG grade, you may complete the \textit{optional} exercise
available to you, which is related to Hidden Markov Models. Note that the exercise
must be completed in full in order to receive VG credit. We expect between half
a page and a page for each section, except when stated otherwise. Please do not
submit more than 5 pages overall. Your answers for each section should be
self-contained. %\todo[inline,
%color=blue!40]{FC: Right now, the assignment consists of many smaller
% questions. Can/Should we make them fewer bigger questions?}
\newsec{POS-Tagging}%
\begin{itemize}
\item In Lab 5, you have tuned a tagger. Based on the best version of
your tagger, you should perform a manual error analysis where you go
through at least 5 sentences and comment on the errors made by the
tagger. Are the mistagged words genuinely ambiguous? Why do you
think they were mistagged? Is it possible that some of the words are
mistagged in the gold standard? \textcolor{UUred}{[ca 1/2 page]}.\footnote{If you need more information about the tagset, see \texttt{http://universaldependencies.org/u/pos/index.html}.}
\item What tagsets exist for your native language? List the ones you
can find (spend no more than 15~minutes on the search) and describe one
of them in more detail.
% \textcolor{UUred}{[$>$ 10
% sentences]} %\\\textbf{\textcolor{blue}{[tagsets $|$
%own web search]}}
\item Is it necessary to tokenize text before tagging it? Please
motivate your answer and give at least one
example.% \textcolor{UUred}{[$>$ 10
% sentences]} %\\\textbf{\textcolor{blue}{[Read the book]}}
\item In the HMM lab we have investigated key sequences and predicted words.
What do these correspond to when using HMMs for POS-tagging? Please motivate your answer. %\textcolor{UUred}{[$>$ 10
% sentences]}
%\\\textbf{\textcolor{blue}{[HMMs $|$ Back-ref to lab]}}
\end{itemize}
%\todo[inline, color=blue!40]{FC: We should discuss whether it is OK to
% keep these individual recommendations for answer lengths. I saw that
% I am the only one using it.}
\newsec{Lemmatisation}%
%What are they supposed to do?
\begin{itemize}
\item In Lab 7, you have tuned a lemmatizer. Based on the best version
of your lemmatizer, you should do a manual analysis of remaining
errors. Describe at least 5 error types and discuss how they could
be tackled in a more sophisticated lemmatizer. \textcolor{UUred}{[ca
1/2 page]}
\item Lemmatizers are often implemented as finite-state transducers
(FSTs). While this kind of implementation is beyond the scope of
this course, Chapter 3.5 (FSTs for Morphological Parsing) of our
course book gives examples of how FSTs can be visualised. Draw an
FST based on the initial lemmatizer we gave you in Lab 7 (repeated
on the last page of this assignment) that can analyse the following
words: \texttt{cats NOUN}, \texttt{jumped VERB}, \texttt{higher
ADJ}. You can \textbf{either} draw the FST by hand, take a picture, and
transform it to \texttt{.pdf}, \textbf{or} use a drawing program
(e.g., xfig or MS Paint) and transform the output into
\texttt{.pdf}. Regardless of how you produce the drawing, it must be included in your submission and
\textbf{not} be submitted as a separate file. Please describe
your drawing. %\textcolor{UUred}{[$>$ 10 sentences]}
\item Why is it more difficult to tag morphologically rich languages?
Please reflect and motivate your answer. %\textcolor{UUred}{[min. 10
% sentences]}
\end{itemize}
\clearpage
\newsec{VG: Hidden Markov Models}
Consider the following \texttt{TRAIN} ``corpus'':
\begin{verbatim}
she/PRON books/VERB trips/NOUN for/ADP kids/NOUN
she/PRON kids/VERB you/PRON all/DET the/DET time/NOUN
kids/NOUN like/VERB books/NOUN like/ADP these/PRON
\end{verbatim}
\noindent Compute the transition and emission probabilities of a 1st order Hidden Markov
Model POS-tagger using this data. Show your results in separate tables.
\\
\\
\noindent Now, consider the following \texttt{TEST} sentence:
\begin{verbatim}
kids like books
\end{verbatim}
\noindent Report the most likely tag sequence for this sentence, as well as its
probability given the \texttt{TRAIN} corpus. Show your work by drawing a Viterbi
trellis.
\section{Grading Criteria}
To pass the assignment, you must meet all the basic criteria on all
subparts of the assignment. To get VG, you must in addition meet some
of the additional criteria for most of the subparts.
\begin{titlemize}{Basic Criteria}
\item Answers are given in understandable English.
\item Answers are stated clearly and coherently.
\item Answers are essentially correct.
\end{titlemize}
\begin{titlemize}{Additional Criteria}
\item Answers are well motivated.
\item Answers are well illustrated.
\item Answers reveal extensive knowledge of the textbook chapter(s).
\end{titlemize}
\section{Submit the assignment}
%\todo[inline, color=blue!40]{FC: we should decide whether or not we
% want them to submit in LaTeX}
\noindent
Please submit your assignment through Studium as a PDF file without identifying
information (i.e., do not include your name in the report or the file name). It
should follow the style and margins given in the example submission, even if
not created with LaTeX. The deadlines can be found in Studium.
%Submit your assignment as a pdf file named
%firstname\_lastname\_assignment\_2.pdf. It should follow the style and
%margins given in the example submission even if not created with
%LaTeX. See deadline on studentportalen.
%Submit your assignment as a pdf file named
%firstname\_lastname\_assignment\_2.pdf. It should follow the style and
%margins given in the example submission even if not created with
%LaTeX. The submission is due in \emph{Studentportalen} before
%Wednesday November 28th at 20h00. Later submissions will be considered
%failed submissions and assessed after the final re-submission deadline
%on January 11th.%\todo[inline, color=green!50]{FC: added submission format requirements}
% To pass the assignment, you must have answered both
% sections, reached at least 5 points in each section and at least 12
% overall. To get VG, you should obtain at least 8 in each section and
% at least 18 points overall. \todo[inline, color=yellow!40]{FC: This
% sentence is a bit redundant. They cannot get 18 points without
% getting at least 8 points in each question anyway, right? Moreover,
% I changed the wording here from 'questions' to 'sections' in order
% to make it fit with the text in the introduction where we say that
% each \textbf{section} is worth 10 points.} \todo[inline, color=blue!40]{FC: maybe
% say ``must have answered \underbar{all} questions?''. Sometimes, the
% large questions contain sub-questions.} \todo[inline,
% color=blue!40]{FC: I added the re-submission policy here and will add
% it to the webpage as well.}
% \item Reflect on the importance of lemmatisation for your native
% language and at least one foreign language you know.\\
% \textbf{\textcolor{blue}{$\rightarrow$ [productivity $|$
% morphological richness]}}
%\item How difficult do you estimate lemmatisation to be for your
% native language (and why)? \\\textbf{\textcolor{blue}{$\rightarrow$
% [ambiguity]}}
%\item What is an FST? What is it used for and why is it useful? (ca. 1/2 page)\\
% \textbf{\textcolor{blue}{$\rightarrow$ [Technical background $|$
% Read the book, esp. 3.4-3.7]}}
%\newsec{Illustrate an FST} %
%\newsec{Error analysis}{error}%
% \newsec{Summing up}%
% \begin{itemize}
% \item Why is it more difficult to tag morphologically rich
% languages?\\\textbf{\textcolor{blue}{[Morphological Productivity
% $\rightarrow$ Unknown words in tag lexicon]}}
% \item What are the possible advantages/disadvantages of first
% performing lemmatisation and then POS-tagging and doing it vice
% versa? \\\textbf{\textcolor{blue}{$\rightarrow$ [error propagation
% $|$ reduce/enhance ambiguity $|$ out-of-the-box thinking]}}
% \end{itemize}
% Framed listing of the Lab 7 skeleton lemmatizer, scaled to 60% of the text
% width. [H] asks for the figure to be placed exactly here.
\begin{figure}[H]
% \centering instead of a center environment: inside a float the environment
% adds extra vertical space of its own, the declaration does not.
\centering
% The trailing % signs keep the line ends from becoming stray spaces inside
% the scaled and framed boxes.
\resizebox{0.6\textwidth}{!}{%
\fbox{%
\lstinputlisting{code/lemmatizer1.py}%
}}
\caption{Skeleton code for a rule-based lemmatizer, taken from Lab 7.}
\end{figure}
\end{document}