-
Notifications
You must be signed in to change notification settings - Fork 1
Getting the Wiki Data
Nitin Kishore Sai Samala edited this page Apr 13, 2017
·
3 revisions
English WIKI
mkdir data
cd data
nohup wget https://dumps.wikimedia.org/enwiki/20170201/enwiki-20170201-pages-articles-multistream.xml.bz2 </dev/null &
cd ~
git clone https://github.com/attardi/wikiextractor
cd wikiextractor
python setup.py install --user
cd ~/data
bzip2 -dk enwiki-20170201-pages-articles-multistream.xml.bz2
mkdir English_Wiki
nohup WikiExtractor.py -o English_Wiki/ --no-templates ~/data/enwiki-20170201-pages-articles-multistream.xml </dev/null &
Spanish WIKI
nohup wget https://dumps.wikimedia.org/eswiki/20170201/eswiki-20170201-pages-articles-multistream.xml.bz2 </dev/null &
bzip2 -dk eswiki-20170201-pages-articles-multistream.xml.bz2
cd wikiextractor
mkdir spanishWiki
nohup WikiExtractor.py -o spanishWiki/ --no-templates ~/eswiki-20170201-pages-articles-multistream.xml </dev/null &
French WIKI
nohup wget https://dumps.wikimedia.org/frwiki/20170401/frwiki-20170401-pages-articles-multistream.xml.bz2 </dev/null &
bzip2 -dk frwiki-20170401-pages-articles-multistream.xml.bz2
cd wikiextractor
mkdir French_Wiki
nohup WikiExtractor.py -o French_Wiki/ --no-templates ~/frwiki-20170401-pages-articles-multistream.xml </dev/null &
Italian WIKI
nohup wget https://dumps.wikimedia.org/itwiki/20170401/itwiki-20170401-pages-articles-multistream.xml.bz2 </dev/null &
bzip2 -dk itwiki-20170401-pages-articles-multistream.xml.bz2
cd wikiextractor
mkdir Italian_Wiki
nohup WikiExtractor.py -o Italian_Wiki/ --no-templates ~/itwiki-20170401-pages-articles-multistream.xml </dev/null &