{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# Finding accounts with correlated posting times\n",
    "\n",
    "This notebook loads the English-language tweets from the combined Ukraine tweet dump, keeps the accounts that tweeted more than once, and computes the pairwise Pearson correlation between the accounts' tweet timestamps. Accounts whose correlations with all other accounts sum to a large positive value are listed at the end as bot candidates (the easy ones, anyway).\n",
    "\n",
    "**Caveat:** the saved outputs come from a run on 2022-06-28 and the execution counts are not sequential. Use *Restart Kernel and Run All* to refresh them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dd109cd7-ce53-450f-adc7-075e49bd5906",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:36.802255Z",
     "iopub.status.busy": "2022-06-28T16:53:36.801869Z",
     "iopub.status.idle": "2022-06-28T16:53:37.024526Z",
     "shell.execute_reply": "2022-06-28T16:53:37.023837Z",
     "shell.execute_reply.started": "2022-06-28T16:53:36.802205Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input file; the path is relative to the kernel's working directory.\n",
    "TWEETS_CSV = 'data/ukraine/UkraineCombinedTweetsDeduped_FEB27.csv'\n",
    "\n",
    "# Minimum number of paired observations two accounts must share\n",
    "# before a correlation is reported for that pair.\n",
    "MIN_SHARED_POSTS = 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a1349058-690b-4f74-8255-5107685ee2b9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T17:57:25.196984Z",
     "iopub.status.busy": "2022-06-28T17:57:25.196516Z",
     "iopub.status.idle": "2022-06-28T17:57:27.522819Z",
     "shell.execute_reply": "2022-06-28T17:57:27.521922Z",
     "shell.execute_reply.started": "2022-06-28T17:57:25.196954Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(227513, 17)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the tweet dump, keep the English-language rows and drop the 'Unnamed: 0' column.\n",
    "# NOTE(review): 'Unnamed: 0' is presumably an index column written by an earlier to_csv - confirm.\n",
    "tweets = pd.read_csv(TWEETS_CSV)\n",
    "tweets = tweets[tweets['language'] == 'en'].drop(columns='Unnamed: 0')\n",
    "tweets.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e6e4c055-6cb9-4de7-a4b7-f71624041265",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T17:58:04.622027Z",
     "iopub.status.busy": "2022-06-28T17:58:04.621398Z",
     "iopub.status.idle": "2022-06-28T17:58:04.635960Z",
     "shell.execute_reply": "2022-06-28T17:58:04.634756Z",
     "shell.execute_reply.started": "2022-06-28T17:58:04.621997Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "158184"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct accounts among the English tweets.\n",
    "# dropna=False counts a missing userid as one value, as len(unique()) would.\n",
    "tweets['userid'].nunique(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "353d00f8-17ed-4a37-b59c-e45c95fdcabb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:39.804941Z",
     "iopub.status.busy": "2022-06-28T16:53:39.804771Z",
     "iopub.status.idle": "2022-06-28T16:53:39.865124Z",
     "shell.execute_reply": "2022-06-28T16:53:39.864400Z",
     "shell.execute_reply.started": "2022-06-28T16:53:39.804926Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Keep every tweet from accounts that appear more than once;\n",
    "# accounts with a single tweet are dropped.\n",
    "non_unique = tweets[tweets.duplicated(subset='userid', keep=False)].reset_index(drop=True)\n",
    "del tweets  # free memory; re-run the load cell to get the frame back"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "61c6f211-3dd9-44a0-bb07-4cdf0cd822d8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:39.866871Z",
     "iopub.status.busy": "2022-06-28T16:53:39.866680Z",
     "iopub.status.idle": "2022-06-28T16:53:39.948021Z",
     "shell.execute_reply": "2022-06-28T16:53:39.947346Z",
     "shell.execute_reply.started": "2022-06-28T16:53:39.866856Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Only userid and creation time are needed from here on.\n",
    "# .copy() makes an independent frame, so the assignment below no longer writes\n",
    "# to a slice of non_unique (which raised SettingWithCopyWarning).\n",
    "users_times = non_unique[['userid', 'tweetcreatedts']].copy()\n",
    "# Store timestamps as int64 so they can go into a numeric correlation.\n",
    "users_times['tweetcreatedts'] = pd.to_datetime(users_times['tweetcreatedts']).values.astype(np.int64)\n",
    "del non_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7c01f422-1fd4-4c7a-af6f-d9a886853c4e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:39.948883Z",
     "iopub.status.busy": "2022-06-28T16:53:39.948707Z",
     "iopub.status.idle": "2022-06-28T16:53:40.566388Z",
     "shell.execute_reply": "2022-06-28T16:53:40.565709Z",
     "shell.execute_reply.started": "2022-06-28T16:53:39.948866Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Dictionary mapping each userid to the list of all its post times.\n",
    "# NOTE(review): each list follows the row order of the CSV; if the rows are not\n",
    "# chronological per account, sort by tweetcreatedts first - TODO confirm.\n",
    "times_dict = users_times.groupby('userid')['tweetcreatedts'].apply(list).to_dict()\n",
    "del users_times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2f90bc27-5cc0-4b76-adfa-1282253001c1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:40.567429Z",
     "iopub.status.busy": "2022-06-28T16:53:40.567260Z",
     "iopub.status.idle": "2022-06-28T16:53:41.004333Z",
     "shell.execute_reply": "2022-06-28T16:53:41.003369Z",
     "shell.execute_reply.started": "2022-06-28T16:53:40.567414Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Transpose so there is one column per userid: row i holds that account's i-th\n",
    "# post time, and empty spots (accounts with fewer posts) are filled with NaN.\n",
    "timeseries = pd.DataFrame.from_dict(times_dict, orient='index').T\n",
    "del times_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "436d7869-0a4d-473f-a6ce-5deb62ceaaa8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:53:41.006385Z",
     "iopub.status.busy": "2022-06-28T16:53:41.006073Z",
     "iopub.status.idle": "2022-06-28T16:55:51.905601Z",
     "shell.execute_reply": "2022-06-28T16:55:51.904885Z",
     "shell.execute_reply.started": "2022-06-28T16:53:41.006368Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Find the bots! The easy ones anyway.\n",
    "# Pairwise Pearson correlation between the accounts' post-time columns.\n",
    "# Pearson r lies in [-1, 1], so negative values are possible; pairs with fewer\n",
    "# than MIN_SHARED_POSTS paired observations come out as NaN.\n",
    "# This is the slow step (the saved run took a bit over two minutes).\n",
    "bots = timeseries.corr(method='pearson', min_periods=MIN_SHARED_POSTS)\n",
    "del timeseries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cbbe80c0-81cc-4a5a-9ae3-a3c58e2f6ce9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:55:51.906614Z",
     "iopub.status.busy": "2022-06-28T16:55:51.906445Z",
     "iopub.status.idle": "2022-06-28T16:55:51.910725Z",
     "shell.execute_reply": "2022-06-28T16:55:51.910051Z",
     "shell.execute_reply.started": "2022-06-28T16:55:51.906599Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Remove self-correlation by zeroing the diagonal.\n",
    "# Writing through bots.values only works when pandas hands back a writable view;\n",
    "# with copy-on-write enabled the array is read-only, so copy it in that case.\n",
    "# bots is then rebuilt from the edited array so the zeroed diagonal always sticks.\n",
    "corr_values = bots.to_numpy()\n",
    "if not corr_values.flags.writeable:\n",
    "    corr_values = corr_values.copy()\n",
    "np.fill_diagonal(corr_values, 0)\n",
    "bots = pd.DataFrame(corr_values, index=bots.index, columns=bots.columns, copy=False)\n",
    "del corr_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "43c1f1eb-89a4-48b8-9624-060c9645b594",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T16:55:51.911612Z",
     "iopub.status.busy": "2022-06-28T16:55:51.911453Z",
     "iopub.status.idle": "2022-06-28T16:55:58.200764Z",
     "shell.execute_reply": "2022-06-28T16:55:58.200024Z",
     "shell.execute_reply.started": "2022-06-28T16:55:51.911597Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Per-account sum of its correlations with every other account.\n",
    "# DataFrame.sum skips NaN entries by default.\n",
    "sum0 = bots.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "cfebc713-bbca-45f8-af6b-af6581f399ff",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-28T17:54:53.515583Z",
     "iopub.status.busy": "2022-06-28T17:54:53.514909Z",
     "iopub.status.idle": "2022-06-28T17:54:53.524870Z",
     "shell.execute_reply": "2022-06-28T17:54:53.524094Z",
     "shell.execute_reply.started": "2022-06-28T17:54:53.515546Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2758100418 6406.690738\n",
       "169433608 6406.604723\n",
       "1420510424697610252 6406.145441\n",
       "39565575 6405.417779\n",
       "1252854227547729921 6404.559219\n",
       " ... \n",
       "259584974 4237.422438\n",
       "2850509662 4212.169727\n",
       "82210282 4210.726707\n",
       "1310836644946145286 4206.247469\n",
       "85844581 4162.165699\n",
       "Length: 7100, dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Accounts with a positive correlation total, highest first.\n",
    "sum0[sum0 > 0].sort_values(ascending=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}