Skip to content

Commit 7d358e2

Browse files
committed
Add MT samples (#716).
1 parent 6d2d038 commit 7d358e2

File tree

326 files changed

+633369
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

326 files changed

+633369
-1
lines changed
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<teiCorpus xmlns="http://www.tei-c.org/ns/1.0"
3+
xml:lang="en"
4+
xml:id="ParlaMint-AT-en.ana"
5+
corresp="../ParlaMint-AT.TEI.ana/ParlaMint-AT.ana.xml">
6+
<teiHeader>
7+
<fileDesc>
8+
<titleStmt>
9+
<title xml:lang="de" type="main">Österreichisches Parlamentskorpus ParlaMint-AT-en [ParlaMint-en.ana SAMPLE]</title>
10+
<title xml:lang="en" type="main">Austrian parliamentary corpus ParlaMint-AT-en [ParlaMint-en.ana SAMPLE]</title>
11+
<title xml:lang="de" type="sub">Stenographische Protokolle der Plenarsitzungen des Österreichischen Nationalrats, XX. Gesetzgebungsperiode - XXVII. Gesetzgebungsperiode (1996 - 2022)</title>
12+
<title xml:lang="en" type="sub">Shorthand records of the plenary sittings of the National Council of the Austrian parliament, terms 20 - 27 (1996 - 2022)</title>
13+
<meeting n="27" corresp="#NR" ana="#parla.lower #parla.term #NR.XXVII"/>
14+
<meeting n="26" corresp="#NR" ana="#parla.lower #parla.term #NR.XXVI"/>
15+
<meeting n="25" corresp="#NR" ana="#parla.lower #parla.term #NR.XXV"/>
16+
<meeting n="24" corresp="#NR" ana="#parla.lower #parla.term #NR.XXIV"/>
17+
<meeting n="23" corresp="#NR" ana="#parla.lower #parla.term #NR.XXIII"/>
18+
<meeting n="22" corresp="#NR" ana="#parla.lower #parla.term #NR.XXII"/>
19+
<meeting n="21" corresp="#NR" ana="#parla.lower #parla.term #NR.XXI"/>
20+
<meeting n="20" corresp="#NR" ana="#parla.lower #parla.term #NR.XX"/>
21+
<respStmt>
22+
<persName ref="https://orcid.org/0000-0002-8111-5584">Hannes Pirker</persName>
23+
<persName ref="https://orcid.org/0000-0003-2436-0361">Daniel Schopper</persName>
24+
<persName ref="https://orcid.org/0000-0002-1631-4560">Tanja Wissik</persName>
25+
<resp xml:lang="de">Projektplanung und Methode</resp>
26+
<resp xml:lang="en">Project set-up and methodology</resp>
27+
</respStmt>
28+
<respStmt>
29+
<persName>Hannes Pirker</persName>
30+
<resp xml:lang="de">Datenbeschaffung, Korpuskodierung in TEI und automatische linguistische Annotation</resp>
31+
<resp xml:lang="en">Data retrieval, TEI corpus encoding and automatic linguistic annotation</resp>
32+
</respStmt>
33+
<respStmt>
34+
<persName>Daniel Schopper</persName>
35+
<resp xml:lang="de">XSLT Transformationen</resp>
36+
<resp xml:lang="en">XSLT transformations</resp>
37+
</respStmt>
38+
<respStmt>
39+
<persName>Martin Kirnbauer</persName>
40+
<resp xml:lang="de">Einige der manuellen Korrekturen</resp>
41+
<resp xml:lang="en">Some of the manual curation</resp>
42+
</respStmt>
43+
<respStmt>
44+
<persName>Tanja Wissik</persName>
45+
<resp xml:lang="de">Metadaten und Übersetzung</resp>
46+
<resp xml:lang="en">Metadata and translation</resp>
47+
</respStmt>
48+
<respStmt>
49+
<persName>Taja Kuzman</persName>
50+
<persName>Nikola Ljubešić</persName>
51+
<resp xml:lang="en">Machine translation to English and linguistic analysis of the translation</resp>
52+
</respStmt>
53+
<funder>
54+
<orgName xml:lang="de">CLARIN-ERIC</orgName>
55+
<orgName xml:lang="en">CLARIN-ERIC (Common Language Resources and Technology Infrastructure—European Research Infrastructure Consortium)</orgName>
56+
<ref target="https://www.clarin.eu/">www.clarin.eu</ref>
57+
</funder>
58+
<funder>
59+
<orgName xml:lang="de">ÖAW (Österreichische Akademie der Wissenschaften)</orgName>
60+
<orgName xml:lang="en">ÖAW (Austrian Academy of Sciences)</orgName>
61+
<ref target="https://www.oeaw.ac.at/">www.oeaw.ac.at</ref>
62+
</funder>
63+
</titleStmt>
64+
<editionStmt>
65+
<edition>3.0</edition>
66+
</editionStmt>
67+
<extent><!--These numbers do not reflect the size of the sample!-->
68+
<measure unit="speeches" quantity="227991" xml:lang="en">227,991 speeches</measure>
69+
<measure unit="words" quantity="63932213" xml:lang="en">63,932,213 words</measure>
70+
</extent>
71+
<publicationStmt>
72+
<publisher>
73+
<orgName xml:lang="de">Die CLARIN Forschungsinfrastruktur</orgName>
74+
<orgName xml:lang="en">The CLARIN research infrastructure</orgName>
75+
<ref target="https://www.clarin.eu/">www.clarin.eu</ref>
76+
</publisher>
77+
<idno type="URI" subtype="handle">http://hdl.handle.net/11356/1810</idno>
78+
<availability status="free">
79+
<licence>http://creativecommons.org/licenses/by/4.0/</licence>
80+
<p xml:lang="de">Dieses Werk ist lizenziert unter der <ref target="http://creativecommons.org/licenses/by/4.0/">Creative Commons Namensnennung 4.0 International Lizenz (CC BY 4.0)</ref>.</p>
81+
<p xml:lang="en">This work is licensed under the <ref target="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ref>.</p>
82+
</availability>
83+
<date when="2023-06-24">2023-06-24</date>
84+
</publicationStmt>
85+
<sourceDesc>
86+
<bibl>
87+
<title type="main" xml:lang="de">Stenographische Protokolle der Plenarsitzungen des Nationalrats der Republik Österreich</title>
88+
<title type="main" xml:lang="en">Shorthand records of the plenary sittings of the National Council of the Austrian parliament</title>
89+
<publisher>Parlamentsdirektion</publisher>
90+
<idno type="URI" subtype="parliament">https://www.parlament.gv.at/PAKT/STPROT</idno>
91+
<date from="1996-01-15" to="2022-05-19">15.01.1996 - 19.05.2022</date>
92+
</bibl>
93+
</sourceDesc>
94+
</fileDesc>
95+
<encodingDesc>
96+
<projectDesc>
97+
<p xml:lang="en">
98+
<ref target="https://www.clarin.eu/content/parlamint">ParlaMint</ref> is a project that aims to (1) create a multilingual set of comparable corpora of parliamentary proceedings uniformly encoded according to the <ref target="https://clarin-eric.github.io/ParlaMint/">ParlaMint encoding guidelines</ref>, covering the period from 2015 to mid-2022; (2) add linguistic annotations to the corpora and machine-translate them to English; (3) make the corpora available through concordancers; and (4) build use cases in Political Sciences and Digital Humanities based on the corpus data.</p>
99+
<p xml:lang="de">
100+
<ref target="https://www.clarin.eu/content/parlamint">ParlaMint</ref>
101+
</p>
102+
</projectDesc>
103+
<editorialDecl>
104+
<correction>
105+
<p>No correction of source texts was performed.</p>
106+
</correction>
107+
<normalization>
108+
<p>Text has not been normalised, except for spacing. Printed matter quoted in the protocols was removed.</p>
109+
</normalization>
110+
<hyphenation>
111+
<p>No end-of-line hyphens were present in the source.</p>
112+
</hyphenation>
113+
<quotation>
114+
<p>Quotation marks have been left in the text and are not explicitly marked up.</p>
115+
</quotation>
116+
<segmentation>
117+
<p>The texts are segmented into utterances (speeches) and segments (corresponding to paragraphs in the source transcription).</p>
118+
</segmentation>
119+
</editorialDecl>
120+
<tagsDecl><!--These numbers do not reflect the size of the sample!-->
121+
<namespace name="http://www.tei-c.org/ns/1.0">
122+
<tagUsage gi="body" occurs="1197"/>
123+
<tagUsage gi="desc" occurs="346176"/>
124+
<tagUsage gi="div" occurs="1197"/>
125+
<tagUsage gi="gap" occurs="14864"/>
126+
<tagUsage gi="kinesic" occurs="248593"/>
127+
<tagUsage gi="name" occurs="2100135"/>
128+
<tagUsage gi="note" occurs="668625"/>
129+
<tagUsage gi="pb" occurs="116531"/>
130+
<tagUsage gi="pc" occurs="9280443"/>
131+
<tagUsage gi="s" occurs="3919672"/>
132+
<tagUsage gi="seg" occurs="662401"/>
133+
<tagUsage gi="text" occurs="1197"/>
134+
<tagUsage gi="u" occurs="227991"/>
135+
<tagUsage gi="vocal" occurs="82719"/>
136+
<tagUsage gi="w" occurs="63932213"/>
137+
</namespace>
138+
</tagsDecl>
139+
<classDecl>
140+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
141+
href="ParlaMint-taxonomy-parla.legislature.xml"/>
142+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
143+
href="ParlaMint-taxonomy-speaker_types.xml"/>
144+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
145+
href="ParlaMint-taxonomy-subcorpus.xml"/>
146+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
147+
href="ParlaMint-taxonomy-NER.ana.xml"/>
148+
</classDecl>
149+
<appInfo>
150+
<application ident="EasyNMT" version="2.0">
151+
<label>EasyNMT (OPUS-MT model)</label>
152+
<desc>Translation to English done with EasyNMT (<ref target="https://github.com/UKPLab/EasyNMT">https://github.com/UKPLab/EasyNMT</ref>) with OPUS-MT model gmw (<ref target="https://github.com/Helsinki-NLP/Opus-MT">https://github.com/Helsinki-NLP/Opus-MT</ref>)</desc>
153+
</application>
154+
<application ident="Stanza" version="1.5">
155+
<label>Stanza</label>
156+
<desc>Tokenisation, PoS tagging, lemmatisation, and NER annotation done with Stanza (<ref target="https://stanfordnlp.github.io/stanza/">https://stanfordnlp.github.io/stanza/</ref>) with the model for English. For NER the conll03 model with 4 NE classes was used.</desc>
157+
</application>
158+
</appInfo>
159+
</encodingDesc>
160+
<profileDesc>
161+
<settingDesc>
162+
<setting>
163+
<name type="city" xml:lang="de">Wien</name>
164+
<name type="city" xml:lang="en">Vienna</name>
165+
<name type="country" xml:lang="de" key="AT">Österreich</name>
166+
<name type="country" xml:lang="en" key="AT">Austria</name>
167+
<date from="1996-01-15" to="2022-05-19"/>
168+
</setting>
169+
</settingDesc>
170+
<textClass>
171+
<catRef scheme="#ParlaMint-taxonomy-parla.legislature"
172+
target="#parla.bi #parla.lower"/>
173+
</textClass>
174+
<particDesc>
175+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="ParlaMint-AT-listOrg.xml"/>
176+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
177+
href="ParlaMint-AT-listPerson.xml"/>
178+
</particDesc>
179+
<langUsage>
180+
<language ident="de" xml:lang="de">Deutsch</language>
181+
<language ident="de" xml:lang="en">German</language>
182+
<language ident="en" xml:lang="de">Englisch</language>
183+
<language ident="en" xml:lang="en">English</language>
184+
</langUsage>
185+
</profileDesc>
186+
<revisionDesc>
187+
<change when="2023-06-24">
188+
<name>Tomaž Erjavec</name>: Made sample.</change>
189+
<change when="2023-06-24">parlamint2release script: Fix some identifiable errors for the release.</change>
190+
<change when="2023-06-23">
191+
<name>Tomaž Erjavec</name>: Generate TEI version of MTed corpus.</change>
192+
<change when="2023-06-24">parlamint-add-common-content script: Adding common content.</change>
193+
</revisionDesc>
194+
</teiHeader>
195+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
196+
href="ParlaMint-AT-en_2005-03-31-022-XXII-NRSITZ-00100.ana.xml"/>
197+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
198+
href="ParlaMint-AT-en_2014-09-24-025-XXV-NRSITZ-00042.ana.xml"/>
199+
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
200+
href="ParlaMint-AT-en_2022-05-19-027-XXVII-NRSITZ-00159.ana.xml"/>
201+
</teiCorpus>
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
ID Title Date Body Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_MP Speaker_Minister Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
2+
ParlaMint-AT_2005-03-31-022-XXII-NRSITZ-00100_d7e355 Sitting Number 100, Legislative period XXII, Thursday, 31. March 2005 2005-03-31 Lower house 22 100 Reference Chairperson MP - FPÖ Freiheitlicher Parlamentsklub Prinzhorn, Thomas M 1943
3+
ParlaMint-AT_2005-03-31-022-XXII-NRSITZ-00100_d7e386 Sitting Number 100, Legislative period XXII, Thursday, 31. March 2005 2005-03-31 Lower house 22 100 Reference Chairperson MP - FPÖ Freiheitlicher Parlamentsklub Prinzhorn, Thomas M 1943

0 commit comments

Comments
 (0)