easier process to create the original dataset #6

Open · wants to merge 3 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ conda install configparser

### Released Corpus
The corpus is available at [this address](https://drive.google.com/open?id=1kkRpVJpo-U6Gt_r4Ly-ciq4pAY03CoTg) under the licence [CC BY-NC-SA 3.0](https://creativecommons.org/licenses/by-nc-sa/3.0/).
We provide the identifiers of the tweets that were used in [1,2]. Due to Twitter's policy we do not release the full (*hydrated*) content: see [here](https://developer.twitter.com/en/developer-terms/more-on-restricted-use-cases) under *Redistribution of Twitter content*. We also provide a program to retrieve a tweet's content from its ID and convert it to the appropriate format. All the material is in the folder `corpus`:
We provide the identifiers of the tweets that were used in [1,2]. Due to Twitter's policy we do not release the full (*hydrated*) content: see [here](https://developer.twitter.com/en/developer-terms/more-on-restricted-use-cases) under *Redistribution of Twitter content*. We also provide programs to retrieve the tweet contents from their IDs and recreate the dataset for evaluation. All the material is in the folder `corpus` (see the [README](corpus/README.md)):
- `mel_train_ids` 35,976 ids of the training evaluation corpus
- `mel_dev_ids` 16,599 ids of the dev evaluation corpus
- `mel_test_ids` 36,521 ids of the test evaluation corpus
33 changes: 30 additions & 3 deletions corpus/README.md
@@ -1,4 +1,4 @@
# Download
# Download and create dataset

All the following material is released under the [licence CC-BY-NC-SA](https://creativecommons.org/licenses/by-nc-sa/3.0/)

@@ -10,7 +10,10 @@ In practice, you can run:
```
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1qYGTUJlkzyFbSTeGP-g1PVefkq8VgUwA' -O ambiguousUsers.db.gz
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1kkRpVJpo-U6Gt_r4Ly-ciq4pAY03CoTg' -O mel_dataset.tar.bz2
```
Then modify the script `get_original_corpus.py` to process all the tweets as wanted. The current script only displays the full text of the tweets and the image URLs (and skips non-existing tweets).

The script `get_original_corpus.py` retrieves the tweets from the tweet ids (the script can be modified to process the tweets as needed). The Twitter API credentials must be specified in the file `twitterAPI.credentials`. The current script outputs the full text of the tweets and the image URLs (skipping non-existing tweets) and, in the case of the KB, the screen name of each author (to get their timeline).
The download of the images from their URLs is not included in the script: it must be performed separately (a sketch is given below).

```
conda create --name mael python=3.8
conda activate mael
@@ -19,7 +22,31 @@ pip install tweepy
tar xjf mel_dataset.tar.bz2
python get_original_corpus.py
```
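
Run without arguments, the script processes `mel_train_ids`, `mel_dev_ids` and `mel_test_ids` into `train_tweets.txt`, `dev_tweets.txt` and `test_tweets.txt`, then the `kb` id list into `timeline_KB.txt`. Id files can also be passed explicitly, in which case the tweets are written to stdout, for example:

```
# fetch only the first 10 tweets of the training ids, as a quick test
python get_original_corpus.py --sample 10 mel_train_ids > sample_tweets.txt
```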
Finally, download the images from their URLs.
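
This step is not part of the repository; the following is a minimal sketch, assuming the tab-separated `*_tweets.txt` files produced above (tweet id in the first field, image URL in the last) and a hypothetical `images/` output folder:

```
#!/usr/bin/env python3
# Hypothetical helper (not part of this repository): download the images
# listed in the last tab-separated field of a *_tweets.txt file produced
# by get_original_corpus.py (without --author), naming each image after
# the tweet id in the first field.
import os
import sys
import urllib.request

def download_images(tweets_filename, out_dir="images"):
    os.makedirs(out_dir, exist_ok=True)
    with open(tweets_filename, encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 3:
                continue
            tweet_id, url = fields[0], fields[-1]
            # keep the extension found in the url, defaulting to .jpg
            ext = os.path.splitext(url)[1] or ".jpg"
            try:
                urllib.request.urlretrieve(url, os.path.join(out_dir, tweet_id + ext))
            except OSError as e:
                sys.stderr.write("failed to fetch {}: {}\n".format(url, e))

if __name__ == "__main__":
    download_images(sys.argv[1])
```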

Then, the script `replace_ambiguous_mentions.py` creates the evaluation dataset by replacing screen names with ambiguous mentions, according to the data from `ambiguousUsers.db`.

Run `gunzip ambiguousUsers.db.gz`, then create the screen-name/mention mapping file with `sqlite3 ambiguousUsers.db` and:
```
sqlite> .output mapScreenNameToMention.txt
sqlite> select ('@'||userScreenName),userSearchQueryLasttName from twitterUsers;
```
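
The resulting file should contain one `screen name|mention` pair per line (`|` is the default sqlite3 column separator, and is what `replace_ambiguous_mentions.py` splits on); the values below are made up for illustration:

```
@JohnSmithNYC|Smith
@jsmith_writer|Smith
```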

and then replace the mentions:

```
python replace_ambiguous_mentions.py mapScreenNameToMention.txt train_tweets.txt > train.txt
```

The output format is one line per tweet, with the following tab-separated fields:

- tweet id
- start position of mention to disambiguate
- end position of mention to disambiguate
- text of mention to disambiguate
- disambiguated screen name
- text of tweet
- url of associated image
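
For illustration, a single line of `train.txt` might then look like the following (fields separated by tabs, all values hypothetical):

```
1065824565706686464	12	17	Smith	@JohnSmithNYC	Congrats to Smith for the news!	http://pbs.twimg.com/media/xxxx.jpg
```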


# md5sum

116 changes: 87 additions & 29 deletions corpus/get_original_corpus.py
100644 → 100755
@@ -1,40 +1,98 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# copyright Copyright (C) 2021 by CEA - LIST
#

# Possible environment:
# conda create --name mael python=3.8
# conda activate mael
# pip install tweepy
import tweepy
from tweepy import OAuthHandler
import argparse,sys,os

#----------------------------------------------------------------------
# Twitter API credentials
credentials={}
try:
    with open(os.path.join(os.path.dirname(__file__),"twitterAPI.credentials")) as fcred:
        for line in fcred:
            name,value=line.rstrip().split("=")
            credentials[name.strip()]=value.strip()
except IOError:
    sys.stderr.write("Error: failed to open file twitterAPI.credentials\n")
    exit(1)

try:
    auth = tweepy.OAuthHandler(credentials["CONSUMER_KEY"],
                               credentials["CONSUMER_SECRET"])
    auth.set_access_token(credentials["ACCESS_KEY"],
                          credentials["ACCESS_KEY_SECRET"])
    api = tweepy.API(auth)
except Exception as e:
    sys.stderr.write("Error: twitter authentication failed: {}\n".format(str(e)))
    exit(1)
#----------------------------------------------------------------------

# main process
def process_list_ids(ids_filename):
    f_ids = open(ids_filename,'r')
    for ii in f_ids.readlines():
def process_list_ids(fids, fout=sys.stdout, output_author=False, sample=None):
    tweetids=fids.readlines()
    if sample:
        tweetids=tweetids[:sample]
    for ii in tweetids:
        id_of_tweet = ii.strip() # e.g id_of_tweet='1065824565706686464'
        # progress goes to stderr so it cannot corrupt the data written to fout
        sys.stderr.write("get tweet {}\n".format(id_of_tweet))
        try:
            tweet = api.get_status(id_of_tweet, tweet_mode="extended")
            # TODO process [text] and [img URL] as you want
            # you need to download the image afterward
            print("\t\t[text]: {}".format(tweet.full_text))
            print("\t\t[img url]: {}".format(tweet.entities['media'][0]['media_url']))

            # here, simply output the tweet id, tweet content and image url

            tweet = api.get_status(id_of_tweet, tweet_mode="extended")
            content=tweet.full_text.replace("\t"," ").replace("\n"," ")
            # .get avoids a KeyError when the tweet carries no media
            img=tweet.entities.get('media',[{}])[0].get('media_url',"")
            if not img:
                sys.stderr.write("--Warning: no image found for tweet id {}\n".format(id_of_tweet))

            if output_author:
                fout.write("\t".join([tweet.author.screen_name,id_of_tweet,content,img])+"\n")
            else:
                fout.write("\t".join([id_of_tweet,content,img])+"\n")

        except tweepy.TweepError as e:
            print("\t\terror code {} message: {}".format(e.args[0][0]['code'],e.args[0][0]['message']))
    f_ids.close()

#Twitter API credentials (TODO set your own from your Twitter developer account)
CONSUMER_KEY = "xxxx"
CONSUMER_SECRET = "yyyy"
ACCESS_KEY = "zzz"
ACCESS_KEY_SECRET = "www"

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_KEY_SECRET)
api = tweepy.API(auth)

# process all ids of the training/validation/test dataset
process_list_ids('mel_train_ids')
process_list_ids('mel_dev_ids')
process_list_ids('mel_test_ids')
sys.stderr.write("--Error on tweet {}: code {}, message: {}\n".format(id_of_tweet,e.args[0][0]['code'],e.args[0][0]['message']))

#----------------------------------------------------------------------
# main function
def main(argv):
    # parse command-line arguments
    parser=argparse.ArgumentParser(description="get tweets from a list of tweet ids, using the twitter API")
    # optional arguments
    parser.add_argument("--author",action="store_true",help="output the screen name of the author of the tweet (to get the timeline tweets in the KB)")
    parser.add_argument("--sample",type=int,help="get only the first X tweets (for test)")
    # positional arguments
    parser.add_argument("input_file",nargs="*",type=argparse.FileType('r',encoding='UTF-8'),help="the file containing the list of tweet ids, one per line")

    param=parser.parse_args()

    # do main
    if len(param.input_file):
        for f in param.input_file:
            process_list_ids(f,output_author=param.author,sample=param.sample)
    else:
        # use default names
        # get mel_train_ids, mel_dev_ids, mel_test_ids
        for f in ("train","dev","test"):
            try:
                filename="mel_%s_ids"%f
                with open(filename) as fin:
                    with open("%s_tweets.txt"%f,"w") as fout:
                        process_list_ids(fin,fout,sample=param.sample)
            except IOError:
                sys.stderr.write("Error: failed to open file %s\n"%filename)
        # get kb
        try:
            with open("kb") as fin:
                with open("timeline_KB.txt","w") as fout:
                    process_list_ids(fin,fout,output_author=True,sample=param.sample)
        except IOError:
            sys.stderr.write("Error: failed to open file kb\n")

if __name__ == "__main__":
    main(sys.argv[1:])

60 changes: 60 additions & 0 deletions corpus/replace_ambiguous_mentions.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# copyright Copyright (C) 2021 by CEA - LIST
#

import sys,argparse
import re
from collections import defaultdict

#----------------------------------------------------------------------

def replace_ambiguous(screenname2mention,f):
    for line in f:
        fields=line.rstrip().split("\t")
        if len(fields)!=3:
            # the line lacks the tweet id, the content or the image url
            sys.stderr.write("--Warning: incomplete line: {}\n".format(line.rstrip()))
            continue
        tweetid,content,img=fields

        found=False
        for sn in re.finditer(r"@\w+",content):
            name=sn.group(0)
            if name in screenname2mention:
                mention=screenname2mention[name]
                text=content.replace(name,mention)
                pos_begin=sn.start()
                pos_end=pos_begin+len(mention)
                print("\t".join(map(str,[tweetid,pos_begin,pos_end,mention,name,text,img])))
                found=True
        if not found:
            sys.stderr.write("--Warning: no ambiguous screen name found for tweet {}\n".format(tweetid))


#----------------------------------------------------------------------
# main function
def main(argv):
    # parse command-line arguments
    parser=argparse.ArgumentParser(description="replace ambiguous screen names in tweets by the corresponding ambiguous mentions")
    # positional arguments
    parser.add_argument("ambiguous",type=argparse.FileType('r',encoding='UTF-8'),help="the file containing the ambiguous mentions with their possible screen names")
    parser.add_argument("tweets_file",type=argparse.FileType('r',encoding='UTF-8'),help="a file containing tweets in which some ambiguous mentions are present")

    param=parser.parse_args()

    # do main
    # get mapping for ambiguous mentions
    screenname2mention=defaultdict(str)
    for mm in param.ambiguous.readlines():
        sn,mention = mm.rstrip().split('|')
        screenname2mention[sn] = mention

    replace_ambiguous(screenname2mention,param.tweets_file)

if __name__ == "__main__":
    main(sys.argv[1:])
4 changes: 4 additions & 0 deletions corpus/twitterAPI.credentials
@@ -0,0 +1,4 @@
CONSUMER_KEY=xxx
CONSUMER_SECRET=yyy
ACCESS_KEY=zzz
ACCESS_KEY_SECRET=ttt