Skip to content

Twitter CSV as input #47

@RIOZHU123

Description

@RIOZHU123

A function that converts a Twitter dataset (in the CSV format produced by twarc2) into the format accepted by preprocess.py might be necessary. A possible implementation:

def csv2csv(df):
    """Convert a Twitter CSV DataFrame into the column layout used downstream.

    Reads the columns 'id', 'author.id', 'author.username', 'text',
    'created_at', 'entities.urls' and 'referenced_tweets' from *df* and
    returns a new DataFrame with the columns 'message_id', 'user_id',
    'username', 'repost_id', 'reply_id', 'message', 'timestamp' and 'urls'.

    * 'urls' is a comma-joined string of every 'expanded_url' found in the
      JSON list stored in 'entities.urls', or None when the cell is missing.
    * 'repost_id' / 'reply_id' are the ids of the referenced tweets whose
      'type' is 'retweeted' / 'replied_to' in the JSON list stored in
      'referenced_tweets', or NaN when there is no such reference.
    * 'timestamp' is the 'created_at' value, passed through unchanged.
      NOTE(review): confirm whether the consumer expects epoch seconds here.

    The input DataFrame is not modified.

    Raises:
        KeyError: if one of the required columns is absent from *df*.
        json.JSONDecodeError: if a non-empty cell does not hold valid JSON.
    """

    def load_json_list(raw):
        # Missing CSV cells show up as NaN/None rather than strings. An
        # identity test against np.nan misses other NaN objects and None,
        # so anything that is not a non-blank string counts as missing.
        if not isinstance(raw, str) or not raw.strip():
            return None
        return json.loads(raw)

    def urlconverter(raw):
        entries = load_json_list(raw)
        if entries is None:
            return None
        expanded = (entry.get('expanded_url') for entry in entries)
        # Entries without an 'expanded_url' are skipped; joining None
        # would raise a TypeError.
        return ','.join(url for url in expanded if url is not None)

    def referenced_ids(raw):
        retweet_id = np.nan
        reply_id = np.nan
        for tw in load_json_list(raw) or []:
            ref_type = tw.get('type')
            if ref_type == 'retweeted':
                retweet_id = tw.get('id')
            elif ref_type == 'replied_to':
                reply_id = tw.get('id')
        return retweet_id, reply_id

    mydf = df[['id', 'author.id', 'author.username', 'text',
               'created_at', 'entities.urls']].copy()
    mydf['entities.urls'] = mydf['entities.urls'].apply(urlconverter)

    id_pairs = [referenced_ids(raw) for raw in df['referenced_tweets'].to_list()]
    mydf['repost_id'] = [retweet_id for retweet_id, _ in id_pairs]
    mydf['reply_id'] = [reply_id for _, reply_id in id_pairs]

    select_columns = ['id', 'author.id', 'author.username', 'repost_id',
                      'reply_id', 'text', 'created_at', 'entities.urls']
    column_names = ['message_id', 'user_id', 'username', 'repost_id',
                    'reply_id', 'message', 'timestamp', 'urls']
    mydf = mydf[select_columns]
    mydf.columns = column_names
    return mydf

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions