{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "16feb8b8-581d-4bec-983a-68b858622696", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "682c6e18-4f1f-454f-88ba-389f999b5974", "metadata": { "tags": [] }, "outputs": [], "source": [
 "# The tweets are read from numbered CSV parts, IRAhandle_tweets_<n>.csv for n in 1..13.\n",
 "# Parts 5, 10 and 12 were flagged as having mixed types and are left out of the load.\n",
 "DATA_DIR = 'data/russian-troll-tweets'\n",
 "SKIPPED_PARTS = {5, 10, 12}\n",
 "\n",
 "# Highest-numbered part first (13 -> 1) so the row order of `df` stays the same.\n",
 "part_numbers = [n for n in range(13, 0, -1) if n not in SKIPPED_PARTS]\n",
 "parts = [pd.read_csv(f'{DATA_DIR}/IRAhandle_tweets_{n}.csv') for n in part_numbers]\n",
 "\n",
 "# reset_index() without drop=True keeps each row's position within its source file\n",
 "# as an `index` column; it returns a new frame instead of mutating in place.\n",
 "df = pd.concat(parts).reset_index()\n",
 "del parts  # the per-part frames are not needed once they are concatenated"
 ] }, { "cell_type": "code", "execution_count": 3, "id": "194a8799-893b-4418-b145-f24ce111f0f9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 2232523 entries, 0 to 2232522\n", "Data columns (total 22 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 index 2232523 non-null int64 \n", " 1 external_author_id 2232523 non-null int64 \n", " 2 author 2232523 non-null object\n", " 3 content 
2232522 non-null object\n", " 4 region 2225360 non-null object\n", " 5 language 2232523 non-null object\n", " 6 publish_date 2232523 non-null object\n", " 7 harvested_date 2232523 non-null object\n", " 8 following 2232523 non-null int64 \n", " 9 followers 2232523 non-null int64 \n", " 10 updates 2232523 non-null int64 \n", " 11 post_type 1020610 non-null object\n", " 12 account_type 2232523 non-null object\n", " 13 retweet 2232523 non-null int64 \n", " 14 account_category 2232523 non-null object\n", " 15 new_june_2018 2232523 non-null int64 \n", " 16 alt_external_id 2232523 non-null int64 \n", " 17 tweet_id 2232523 non-null int64 \n", " 18 article_url 2232523 non-null object\n", " 19 tco1_step1 1608632 non-null object\n", " 20 tco2_step1 538066 non-null object\n", " 21 tco3_step1 13690 non-null object\n", "dtypes: int64(9), object(13)\n", "memory usage: 374.7+ MB\n" ] } ], "source": [ "df.info(show_counts=True)" ] }, { "cell_type": "code", "execution_count": 4, "id": "fc58d1a7-6e84-41ec-a616-269d012b106c", "metadata": { "tags": [] }, "outputs": [], "source": [
 "# Keep only the rows whose `language` value is exactly 'English'.\n",
 "# The original row labels are kept (no re-indexing after the filter).\n",
 "is_english = df['language'].eq('English')\n",
 "df = df.loc[is_english]"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "5873887b-f2ea-412f-aa1d-2f2eac02ae96", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 1605873 entries, 11 to 2232522\n", "Data columns (total 22 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 index 1605873 non-null int64 \n", " 1 external_author_id 1605873 non-null int64 \n", " 2 author 1605873 non-null object\n", " 3 content 1605872 non-null object\n", " 4 region 1604847 non-null object\n", " 5 language 1605873 non-null object\n", " 6 publish_date 1605873 non-null object\n", " 7 harvested_date 1605873 non-null object\n", " 8 following 1605873 non-null int64 \n", " 9 followers 1605873 non-null int64 \n", " 10 updates 1605873 non-null int64 \n", " 11 post_type 682199 non-null object\n", " 12 account_type 
1605873 non-null object\n", " 13 retweet 1605873 non-null int64 \n", " 14 account_category 1605873 non-null object\n", " 15 new_june_2018 1605873 non-null int64 \n", " 16 alt_external_id 1605873 non-null int64 \n", " 17 tweet_id 1605873 non-null int64 \n", " 18 article_url 1605873 non-null object\n", " 19 tco1_step1 1051365 non-null object\n", " 20 tco2_step1 361124 non-null object\n", " 21 tco3_step1 12955 non-null object\n", "dtypes: int64(9), object(13)\n", "memory usage: 281.8+ MB\n" ] } ], "source": [
 "# Ask for the non-null counts explicitly: by default pandas only prints them while the\n",
 "# frame is below `display.max_info_rows`, so the summary would change with the row count.\n",
 "df.info(show_counts=True)"
 ] }, { "cell_type": "code", "execution_count": 6, "id": "9520567b-8886-4e35-a1b4-51be2a2fd26d", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "array(['Russian', 'German', 'Koch', 'Left', 'Right', '?', 'local',\n", " 'Italian', 'Hashtager', 'Arabic', 'news', 'French', 'Spanish',\n", " 'Commercial', 'ZAPOROSHIA'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Distinct `account_type` labels that remain after the English-only filter.\n",
 "df['account_type'].unique()"
 ] }, { "cell_type": "code", "execution_count": 7, "id": "83072d1e-60d0-4a1a-9af0-d73030ed6398", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "11 9/15/2015 17:43\n", "12 9/15/2015 17:55\n", "32 9/16/2015 8:04\n", "86 9/20/2015 9:11\n", "190 9/28/2015 17:58\n", "Name: publish_date, dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# `publish_date` is still stored as text (object dtype); peek at the first values.\n",
 "df['publish_date'].head()"
 ] }, { "cell_type": "code", "execution_count": 8, "id": "b0b2e5fa-21fa-4d63-8f47-2d4cf0f79c57", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "2232518 3/8/2017 8:59\n", "2232519 3/8/2017 8:59\n", "2232520 3/8/2017 8:59\n", "2232521 3/8/2017 8:59\n", "2232522 3/8/2017 8:59\n", "Name: publish_date, dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# ...and at the last values of the same column.\n",
 "df['publish_date'].tail()"
 ] }, { "cell_type": "code", "execution_count": 9, "id": "a58504b0-24c2-4da9-8773-49713fb0ece9", "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexexternal_author_idauthorcontentregionlanguagepublish_dateharvested_datefollowingfollowers...account_typeretweetaccount_categorynew_june_2018alt_external_idtweet_idarticle_urltco1_step1tco2_step1tco3_step1
11112589513234ZUBOVNIK.@McFaul same to you! http://t.co/BbPvCzR0kxUnknownEnglish9/15/2015 17:439/15/2015 17:43355922650...Russian0NonEnglish02589513234643842467972968448http://twitter.com/zubovnik/statuses/643842467...https://twitter.com/zubovnik/status/6438424679...NaNNaN
12122589513234ZUBOVNIK'@McFaul in twitter?'UnknownEnglish9/15/2015 17:559/15/2015 17:56355922650...Russian0NonEnglish02589513234643845691362668544http://twitter.com/zubovnik/statuses/643845691...NaNNaNNaN
32322589513234ZUBOVNIK'@McFaul US stop bombing Zivilisten in Syrien'UnknownEnglish9/16/2015 8:049/16/2015 8:04355922659...Russian0NonEnglish02589513234644059347501363200http://twitter.com/zubovnik/statuses/644059347...NaNNaNNaN
86862589513234ZUBOVNIK'@realDonaldTrump @MoeHoward86 @YouTube Good l...UnknownEnglish9/20/2015 9:119/20/2015 9:11408922453...Russian0NonEnglish02589513234645525713626529792http://twitter.com/zubovnik/statuses/645525713...NaNNaNNaN
1901902589513234ZUBOVNIKSoviet soldiers marching on 1943. Notice the f...UnknownEnglish9/28/2015 17:589/28/2015 17:58396422227...Russian1NonEnglish02589513234648557409682763776http://twitter.com/zubovnik/statuses/648557409...https://twitter.com/MatEvidence/status/6485570...NaNNaN
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " index external_author_id author \\\n", "11 11 2589513234 ZUBOVNIK \n", "12 12 2589513234 ZUBOVNIK \n", "32 32 2589513234 ZUBOVNIK \n", "86 86 2589513234 ZUBOVNIK \n", "190 190 2589513234 ZUBOVNIK \n", "\n", " content region language \\\n", "11 .@McFaul same to you! http://t.co/BbPvCzR0kx Unknown English \n", "12 '@McFaul in twitter?' Unknown English \n", "32 '@McFaul US stop bombing Zivilisten in Syrien' Unknown English \n", "86 '@realDonaldTrump @MoeHoward86 @YouTube Good l... Unknown English \n", "190 Soviet soldiers marching on 1943. Notice the f... Unknown English \n", "\n", " publish_date harvested_date following followers ... \\\n", "11 9/15/2015 17:43 9/15/2015 17:43 3559 22650 ... \n", "12 9/15/2015 17:55 9/15/2015 17:56 3559 22650 ... \n", "32 9/16/2015 8:04 9/16/2015 8:04 3559 22659 ... \n", "86 9/20/2015 9:11 9/20/2015 9:11 4089 22453 ... \n", "190 9/28/2015 17:58 9/28/2015 17:58 3964 22227 ... \n", "\n", " account_type retweet account_category new_june_2018 alt_external_id \\\n", "11 Russian 0 NonEnglish 0 2589513234 \n", "12 Russian 0 NonEnglish 0 2589513234 \n", "32 Russian 0 NonEnglish 0 2589513234 \n", "86 Russian 0 NonEnglish 0 2589513234 \n", "190 Russian 1 NonEnglish 0 2589513234 \n", "\n", " tweet_id article_url \\\n", "11 643842467972968448 http://twitter.com/zubovnik/statuses/643842467... \n", "12 643845691362668544 http://twitter.com/zubovnik/statuses/643845691... \n", "32 644059347501363200 http://twitter.com/zubovnik/statuses/644059347... \n", "86 645525713626529792 http://twitter.com/zubovnik/statuses/645525713... \n", "190 648557409682763776 http://twitter.com/zubovnik/statuses/648557409... \n", "\n", " tco1_step1 tco2_step1 tco3_step1 \n", "11 https://twitter.com/zubovnik/status/6438424679... NaN NaN \n", "12 NaN NaN NaN \n", "32 NaN NaN NaN \n", "86 NaN NaN NaN \n", "190 https://twitter.com/MatEvidence/status/6485570... 
NaN NaN \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# First five rows of the filtered frame.\n",
 "df.head(n=5)"
 ] }, { "cell_type": "code", "execution_count": 10, "id": "667ba830-6e51-4cc9-82ff-e73db03e6f0e", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexexternal_author_idauthorcontentregionlanguagepublish_dateharvested_datefollowingfollowers...account_typeretweetaccount_categorynew_june_2018alt_external_idtweet_idarticle_urltco1_step1tco2_step1tco3_step1
22325182438862497991305AUSTINLOVESBEERBREAKING: Killer avalanche sweeps three skiers...United StatesEnglish3/8/2017 8:593/8/2017 8:594134...Right1RightTroll02497991305839400198002503680http://twitter.com/2497991305/statuses/8394001...https://twitter.com/Daily_Star/status/83938477...http://bit.ly/2lWNDntNaN
22325192438872497991305AUSTINLOVESBEERWhy men should support International Women’s D...United StatesEnglish3/8/2017 8:593/8/2017 9:004134...Right1RightTroll02497991305839400290168135680http://twitter.com/2497991305/statuses/8394002...http://trib.al/xiMs3mdNaNNaN
22325202438882497991305AUSTINLOVESBEERHow we can rebuild trust in a UK divided by in...United StatesEnglish3/8/2017 8:593/8/2017 8:594134...Right1RightTroll02497991305839400090582179840http://twitter.com/2497991305/statuses/8394000...http://trib.al/l3iyCVFNaNNaN
22325212438892497991305AUSTINLOVESBEERJohn Humphrys accused of patronising Angela Ra...United StatesEnglish3/8/2017 8:593/8/2017 8:594134...Right1RightTroll02497991305839400131325648896http://twitter.com/2497991305/statuses/8394001...http://bit.ly/2m0OQL7NaNNaN
22325222438902497991305AUSTINLOVESBEERFossilized poop found in 180-million-year-old ...United StatesEnglish3/8/2017 8:593/8/2017 8:594134...Right1RightTroll02497991305839400253413437440http://twitter.com/2497991305/statuses/8394002...http://dailym.ai/2lV5BXfNaNNaN
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " index external_author_id author \\\n", "2232518 243886 2497991305 AUSTINLOVESBEER \n", "2232519 243887 2497991305 AUSTINLOVESBEER \n", "2232520 243888 2497991305 AUSTINLOVESBEER \n", "2232521 243889 2497991305 AUSTINLOVESBEER \n", "2232522 243890 2497991305 AUSTINLOVESBEER \n", "\n", " content region \\\n", "2232518 BREAKING: Killer avalanche sweeps three skiers... United States \n", "2232519 Why men should support International Women’s D... United States \n", "2232520 How we can rebuild trust in a UK divided by in... United States \n", "2232521 John Humphrys accused of patronising Angela Ra... United States \n", "2232522 Fossilized poop found in 180-million-year-old ... United States \n", "\n", " language publish_date harvested_date following followers ... \\\n", "2232518 English 3/8/2017 8:59 3/8/2017 8:59 41 34 ... \n", "2232519 English 3/8/2017 8:59 3/8/2017 9:00 41 34 ... \n", "2232520 English 3/8/2017 8:59 3/8/2017 8:59 41 34 ... \n", "2232521 English 3/8/2017 8:59 3/8/2017 8:59 41 34 ... \n", "2232522 English 3/8/2017 8:59 3/8/2017 8:59 41 34 ... \n", "\n", " account_type retweet account_category new_june_2018 alt_external_id \\\n", "2232518 Right 1 RightTroll 0 2497991305 \n", "2232519 Right 1 RightTroll 0 2497991305 \n", "2232520 Right 1 RightTroll 0 2497991305 \n", "2232521 Right 1 RightTroll 0 2497991305 \n", "2232522 Right 1 RightTroll 0 2497991305 \n", "\n", " tweet_id \\\n", "2232518 839400198002503680 \n", "2232519 839400290168135680 \n", "2232520 839400090582179840 \n", "2232521 839400131325648896 \n", "2232522 839400253413437440 \n", "\n", " article_url \\\n", "2232518 http://twitter.com/2497991305/statuses/8394001... \n", "2232519 http://twitter.com/2497991305/statuses/8394002... \n", "2232520 http://twitter.com/2497991305/statuses/8394000... \n", "2232521 http://twitter.com/2497991305/statuses/8394001... \n", "2232522 http://twitter.com/2497991305/statuses/8394002... 
\n", "\n", " tco1_step1 \\\n", "2232518 https://twitter.com/Daily_Star/status/83938477... \n", "2232519 http://trib.al/xiMs3md \n", "2232520 http://trib.al/l3iyCVF \n", "2232521 http://bit.ly/2m0OQL7 \n", "2232522 http://dailym.ai/2lV5BXf \n", "\n", " tco2_step1 tco3_step1 \n", "2232518 http://bit.ly/2lWNDnt NaN \n", "2232519 NaN NaN \n", "2232520 NaN NaN \n", "2232521 NaN NaN \n", "2232522 NaN NaN \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Last five rows of the filtered frame.\n",
 "df.tail(n=5)"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }