Twitter CSV as input

A function that converts a Twitter dataset (the CSV format produced by twarc2) into the format accepted by preprocess.py might be necessary. A possible implementation:

```
def csv2csv(df):
    """Convert a twarc2-style Twitter CSV DataFrame to the preprocess.py layout.

    Args:
        df: DataFrame holding at least the columns 'id', 'author.id',
            'author.username', 'text', 'created_at', 'entities.urls' and
            'referenced_tweets'. The last two are expected to hold
            JSON-encoded lists of dicts, or a missing value (NaN/None).

    Returns:
        A new DataFrame with the columns 'message_id', 'user_id', 'username',
        'repost_id', 'reply_id', 'message', 'timestamp' and 'urls'.
        'urls' is a comma-joined string of expanded URLs (None when the source
        cell is missing); 'repost_id' / 'reply_id' are the referenced tweet
        ids, or NaN when the tweet is not a retweet / reply.

    Raises:
        KeyError: if a required column is absent from ``df``.
        json.JSONDecodeError: if a non-empty cell is not valid JSON.
    """
    # Local imports keep the snippet self-contained.
    import json

    import numpy as np

    def parse_json_cell(cell):
        """Decode a JSON cell; return None when the cell is missing."""
        # Missing CSV cells show up as NaN/None (never str), so a type check
        # is more reliable than an identity comparison against np.nan.
        if not isinstance(cell, str) or not cell.strip():
            return None
        return json.loads(cell)

    def urlconverter(cell):
        """Return the expanded URLs of a cell joined by commas, or None."""
        entries = parse_json_cell(cell)
        if entries is None:
            return None
        expanded = (entry.get('expanded_url') for entry in entries)
        # Skip entries without an expanded URL; str.join rejects None.
        return ','.join(url for url in expanded if url)

    def referenced_ids(cell):
        """Return (retweet_id, reply_id) for one 'referenced_tweets' cell."""
        retweet_id = np.nan
        reply_id = np.nan
        for ref in parse_json_cell(cell) or []:
            ref_type = ref.get('type')
            if ref_type == 'retweeted':
                retweet_id = ref.get('id')
            elif ref_type == 'replied_to':
                # Assignment, not comparison: otherwise reply ids are lost.
                reply_id = ref.get('id')
        return retweet_id, reply_id

    mydf = df[['id', 'author.id', 'author.username', 'text', 'created_at',
               'entities.urls']].copy()
    mydf['entities.urls'] = mydf['entities.urls'].apply(urlconverter)

    id_pairs = [referenced_ids(cell) for cell in df['referenced_tweets']]
    mydf['repost_id'] = [retweet_id for retweet_id, _ in id_pairs]
    mydf['reply_id'] = [reply_id for _, reply_id in id_pairs]

    # NOTE(review): 'timestamp' in the output is the raw 'created_at' value.
    # An earlier draft also derived `(int(id) >> 22) / 1000` from the tweet id
    # but never emitted it (and it omitted the snowflake epoch offset), so the
    # dead computation was dropped. Confirm which form preprocess.py expects.
    column_map = {
        'id': 'message_id',
        'author.id': 'user_id',
        'author.username': 'username',
        'repost_id': 'repost_id',
        'reply_id': 'reply_id',
        'text': 'message',
        'created_at': 'timestamp',
        'entities.urls': 'urls',
    }
    mydf = mydf[list(column_map)]
    mydf.columns = list(column_map.values())
    return mydf
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Twitter CSV as input #47

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Twitter CSV as input #47

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions