-
Notifications
You must be signed in to change notification settings - Fork 14
Open
Description
A function that converts a Twitter dataset (the CSV format produced by twarc2) into the format accepted by preprocess.py might be necessary. A possible implementation:
def csv2csv(df):
    """Reshape a flattened Twitter DataFrame into the eight-column message layout.

    Reads the columns ``id``, ``author.id``, ``author.username``, ``text``,
    ``created_at``, ``entities.urls`` and ``referenced_tweets`` from *df* and
    returns a new DataFrame with the columns ``message_id``, ``user_id``,
    ``username``, ``repost_id``, ``reply_id``, ``message``, ``timestamp`` and
    ``urls``. *df* itself is not modified.

    ``entities.urls`` and ``referenced_tweets`` cells are expected to hold
    JSON-encoded lists of objects; any cell that is not a non-empty string
    (NaN, None, '') is treated as "no value".

    Relies on ``json`` and ``np`` (numpy) being available at module level.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        A DataFrame with one row per input row. ``repost_id`` / ``reply_id``
        hold the referenced tweet id or NaN; ``urls`` holds the expanded URLs
        joined with ',' or None when the source cell is missing;
        ``timestamp`` is the unmodified ``created_at`` value.

    Raises:
        KeyError: if a required column is absent from *df*.
        json.JSONDecodeError: if a non-empty cell is not valid JSON.
    """

    def _has_json(cell):
        # A type check rather than `cell is not np.nan`: the identity test
        # misses NaN floats that are not the np.nan singleton, which would
        # then be handed to json.loads and raise TypeError.
        return isinstance(cell, str) and bool(cell.strip())

    def _expanded_urls(cell):
        """Return the cell's expanded URLs joined by ',', or None if missing."""
        if not _has_json(cell):
            return None
        candidates = (entry.get('expanded_url') for entry in json.loads(cell))
        # Entries without an 'expanded_url' yield None; skip them so that
        # str.join does not raise TypeError.
        return ','.join(url for url in candidates if url)

    def _referenced_ids(cell):
        """Return (retweeted_id, replied_to_id) for one cell, NaN when absent."""
        retweet_id = np.nan
        reply_id = np.nan
        if _has_json(cell):
            for ref in json.loads(cell):
                ref_type = ref.get('type')
                if ref_type == 'retweeted':
                    retweet_id = ref.get('id')
                elif ref_type == 'replied_to':
                    # Assignment, not comparison: `==` here left reply_id
                    # permanently NaN.
                    reply_id = ref.get('id')
        return retweet_id, reply_id

    mydf = df[['id', 'author.id', 'author.username', 'text',
               'created_at', 'entities.urls']].copy()
    mydf['entities.urls'] = mydf['entities.urls'].apply(_expanded_urls)

    id_pairs = [_referenced_ids(cell) for cell in df['referenced_tweets'].to_list()]
    mydf['repost_id'] = [retweet_id for retweet_id, _ in id_pairs]
    mydf['reply_id'] = [reply_id for _, reply_id in id_pairs]

    # NOTE(review): an earlier revision also computed (int(id) >> 22) / 1000
    # into a 'timestamp' column, but that column was never selected for the
    # output -- the emitted 'timestamp' has always been 'created_at'. The dead
    # computation is dropped here; confirm which representation the consumer
    # actually expects.
    column_map = {
        'id': 'message_id',
        'author.id': 'user_id',
        'author.username': 'username',
        'repost_id': 'repost_id',
        'reply_id': 'reply_id',
        'text': 'message',
        'created_at': 'timestamp',
        'entities.urls': 'urls',
    }
    return mydf[list(column_map)].rename(columns=column_map)
Metadata
Metadata
Assignees
Labels
No labels