bots/Untitled.ipynb
2024-01-12 09:27:09 -05:00

319 lines
9.1 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dd109cd7-ce53-450f-adc7-075e49bd5906",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:36.802255Z",
"iopub.status.busy": "2022-06-28T16:53:36.801869Z",
"iopub.status.idle": "2022-06-28T16:53:37.024526Z",
"shell.execute_reply": "2022-06-28T16:53:37.023837Z",
"shell.execute_reply.started": "2022-06-28T16:53:36.802205Z"
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a1349058-690b-4f74-8255-5107685ee2b9",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T17:57:25.196984Z",
"iopub.status.busy": "2022-06-28T17:57:25.196516Z",
"iopub.status.idle": "2022-06-28T17:57:27.522819Z",
"shell.execute_reply": "2022-06-28T17:57:27.521922Z",
"shell.execute_reply.started": "2022-06-28T17:57:25.196954Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(227513, 17)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the tweet CSV, keep only rows whose language is 'en', and drop the\n",
"# 'Unnamed: 0' column, all in one chain so no frame is modified in place.\n",
"tweets = (\n",
"    pd.read_csv('data/ukraine/UkraineCombinedTweetsDeduped_FEB27.csv')\n",
"    .loc[lambda frame: frame.language == 'en']\n",
"    .drop(columns='Unnamed: 0')\n",
")\n",
"# Show (rows, columns) of the filtered frame.\n",
"tweets.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e6e4c055-6cb9-4de7-a4b7-f71624041265",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T17:58:04.622027Z",
"iopub.status.busy": "2022-06-28T17:58:04.621398Z",
"iopub.status.idle": "2022-06-28T17:58:04.635960Z",
"shell.execute_reply": "2022-06-28T17:58:04.634756Z",
"shell.execute_reply.started": "2022-06-28T17:58:04.621997Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"158184"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct user ids among the English tweets.\n",
"# dropna=False counts a missing id as its own value, exactly as unique() would.\n",
"tweets['userid'].nunique(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "353d00f8-17ed-4a37-b59c-e45c95fdcabb",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:39.804941Z",
"iopub.status.busy": "2022-06-28T16:53:39.804771Z",
"iopub.status.idle": "2022-06-28T16:53:39.865124Z",
"shell.execute_reply": "2022-06-28T16:53:39.864400Z",
"shell.execute_reply.started": "2022-06-28T16:53:39.804926Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Keep only rows whose userid occurs more than once; users with a single\n",
"# tweet are dropped. keep=False marks every occurrence of a repeated id.\n",
"# reset_index() without drop=True keeps the old row labels as an 'index' column.\n",
"non_unique = tweets.loc[tweets.duplicated(subset='userid', keep=False)].reset_index()\n",
"# The full frame is not used by the statements that follow; release it.\n",
"del tweets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "61c6f211-3dd9-44a0-bb07-4cdf0cd822d8",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:39.866871Z",
"iopub.status.busy": "2022-06-28T16:53:39.866680Z",
"iopub.status.idle": "2022-06-28T16:53:39.948021Z",
"shell.execute_reply": "2022-06-28T16:53:39.947346Z",
"shell.execute_reply.started": "2022-06-28T16:53:39.866856Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Only the user id and the creation time are needed from here on.\n",
"# .copy() gives users_times its own data, so the column assignment below does\n",
"# not write into a slice of non_unique (which raised SettingWithCopyWarning).\n",
"users_times = non_unique[['userid', 'tweetcreatedts']].copy()\n",
"# Parse the creation times and store the datetime64 values as int64 integers.\n",
"users_times['tweetcreatedts'] = pd.to_datetime(users_times['tweetcreatedts']).values.astype(np.int64)\n",
"del non_unique"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7c01f422-1fd4-4c7a-af6f-d9a886853c4e",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:39.948883Z",
"iopub.status.busy": "2022-06-28T16:53:39.948707Z",
"iopub.status.idle": "2022-06-28T16:53:40.566388Z",
"shell.execute_reply": "2022-06-28T16:53:40.565709Z",
"shell.execute_reply.started": "2022-06-28T16:53:39.948866Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Map each userid to the list of all of that user's post times.\n",
"times_dict = (\n",
"    users_times\n",
"    .groupby('userid')['tweetcreatedts']\n",
"    .apply(list)   # one Python list of post times per user\n",
"    .to_dict()     # {userid: [post times]}\n",
")\n",
"del users_times"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2f90bc27-5cc0-4b76-adfa-1282253001c1",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:40.567429Z",
"iopub.status.busy": "2022-06-28T16:53:40.567260Z",
"iopub.status.idle": "2022-06-28T16:53:41.004333Z",
"shell.execute_reply": "2022-06-28T16:53:41.003369Z",
"shell.execute_reply.started": "2022-06-28T16:53:40.567414Z"
},
"tags": []
},
"outputs": [],
"source": [
"# orient='index' builds one row per userid; transposing turns that into one\n",
"# column per userid, filled with that user's post times. Users with fewer\n",
"# posts than the longest list have their empty spots filled with NaN.\n",
"timeseries = pd.DataFrame.from_dict(times_dict, orient='index').transpose()\n",
"del times_dict"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "436d7869-0a4d-473f-a6ce-5deb62ceaaa8",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:53:41.006385Z",
"iopub.status.busy": "2022-06-28T16:53:41.006073Z",
"iopub.status.idle": "2022-06-28T16:55:51.905601Z",
"shell.execute_reply": "2022-06-28T16:55:51.904885Z",
"shell.execute_reply.started": "2022-06-28T16:53:41.006368Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Find the bots (the easy ones, anyway): pairwise Pearson correlation between\n",
"# the per-user columns of post times. Pearson's r lies in [-1, 1]; values near\n",
"# +1 mean two users' post times rise together.\n",
"# min_periods=4: a pair with fewer than 4 rows where both users have a value\n",
"# gets NaN instead of a correlation.\n",
"bots = timeseries.corr(\n",
"    method='pearson',\n",
"    min_periods=4,\n",
")\n",
"del timeseries"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cbbe80c0-81cc-4a5a-9ae3-a3c58e2f6ce9",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:55:51.906614Z",
"iopub.status.busy": "2022-06-28T16:55:51.906445Z",
"iopub.status.idle": "2022-06-28T16:55:51.910725Z",
"shell.execute_reply": "2022-06-28T16:55:51.910051Z",
"shell.execute_reply.started": "2022-06-28T16:55:51.906599Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Remove self-correlation: zero the diagonal so a user's correlation with\n",
"# itself does not count towards the per-user totals.\n",
"# The write goes through .iat rather than np.fill_diagonal(bots.values, 0):\n",
"# .values is not guaranteed to be a writable view of the frame (it is\n",
"# read-only under pandas Copy-on-Write), in which case the fill would fail.\n",
"for pos in range(len(bots.columns)):\n",
"    bots.iat[pos, pos] = 0"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "43c1f1eb-89a4-48b8-9624-060c9645b594",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T16:55:51.911612Z",
"iopub.status.busy": "2022-06-28T16:55:51.911453Z",
"iopub.status.idle": "2022-06-28T16:55:58.200764Z",
"shell.execute_reply": "2022-06-28T16:55:58.200024Z",
"shell.execute_reply.started": "2022-06-28T16:55:51.911597Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Per-user total of the correlations in that user's column (NaN entries are skipped).\n",
"sum0 = bots.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cfebc713-bbca-45f8-af6b-af6581f399ff",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-28T17:54:53.515583Z",
"iopub.status.busy": "2022-06-28T17:54:53.514909Z",
"iopub.status.idle": "2022-06-28T17:54:53.524870Z",
"shell.execute_reply": "2022-06-28T17:54:53.524094Z",
"shell.execute_reply.started": "2022-06-28T17:54:53.515546Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"2758100418 6406.690738\n",
"169433608 6406.604723\n",
"1420510424697610252 6406.145441\n",
"39565575 6405.417779\n",
"1252854227547729921 6404.559219\n",
" ... \n",
"259584974 4237.422438\n",
"2850509662 4212.169727\n",
"82210282 4210.726707\n",
"1310836644946145286 4206.247469\n",
"85844581 4162.165699\n",
"Length: 7100, dtype: float64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Users whose correlation total is positive, largest total first.\n",
"# NOTE(review): the totals shown are all in the thousands, i.e. each listed user\n",
"# correlates strongly with thousands of others - confirm this score really\n",
"# separates coordinated accounts before treating it as a bot ranking.\n",
"sum0.loc[sum0.gt(0)].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f00b1151-e3b4-444d-b7e9-da6c7ea29114",
"metadata": {},
"outputs": [],
"source": [
"# "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}