886 lines
160 KiB
Text
886 lines
160 KiB
Text
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "145fbd89-a3ee-4628-8e83-4069e504fb03",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Initial EDA\n",
|
|||
|
"What's in here that looks useful?"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"id": "075030c8-e9fe-4fce-b752-d910d64ebfb1",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:39.214089Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:39.213748Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:40.335156Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:40.334266Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:39.214015Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"sns.set_theme(style='darkgrid')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "c14f6c4c-fb4b-4e69-817e-28dbae07c298",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## First load the data and get basic info"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"id": "94c1ae77-ccbd-42ce-9e34-3b0f96342002",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:40.336431Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:40.336185Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:40.738364Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:40.737807Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:40.336415Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 48565 entries, 0 to 48564\n",
|
|||
|
"Data columns (total 23 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 author 48544 non-null object \n",
|
|||
|
" 1 created_utc 48565 non-null float64\n",
|
|||
|
" 2 distinguished 47 non-null object \n",
|
|||
|
" 3 edited 48565 non-null object \n",
|
|||
|
" 4 id 48565 non-null object \n",
|
|||
|
" 5 is_original_content 48565 non-null bool \n",
|
|||
|
" 6 is_self 48565 non-null bool \n",
|
|||
|
" 7 link_flair_text 25836 non-null object \n",
|
|||
|
" 8 locked 48565 non-null bool \n",
|
|||
|
" 9 name 48565 non-null object \n",
|
|||
|
" 10 num_comments 48565 non-null int64 \n",
|
|||
|
" 11 over_18 48565 non-null bool \n",
|
|||
|
" 12 permalink 48565 non-null object \n",
|
|||
|
" 13 score 48565 non-null int64 \n",
|
|||
|
" 14 selftext 2243 non-null object \n",
|
|||
|
" 15 spoiler 48565 non-null bool \n",
|
|||
|
" 16 stickied 48565 non-null bool \n",
|
|||
|
" 17 subreddit 48565 non-null object \n",
|
|||
|
" 18 title 48565 non-null object \n",
|
|||
|
" 19 upvote_ratio 48565 non-null float64\n",
|
|||
|
" 20 url 48565 non-null object \n",
|
|||
|
" 21 utc_now 48565 non-null float64\n",
|
|||
|
" 22 post_age 48565 non-null float64\n",
|
|||
|
"dtypes: bool(6), float64(4), int64(2), object(11)\n",
|
|||
|
"memory usage: 6.6+ MB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Load from \"db\"\n",
|
|||
|
"db = pd.read_csv('data/startingover.csv')\n",
|
|||
|
"db.info()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "b2759ec8-a6f9-43fc-926e-8fb85edf49df",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"`num_comments` is going to be the target, so let's take a closer look at its distribution."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"id": "4a036089-f235-4a48-b57a-6b5f7a8f598d",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:40.740085Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:40.739890Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:41.471046Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:41.470383Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:40.740063Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 48565.000000\n",
|
|||
|
"mean 84.723463\n",
|
|||
|
"std 346.739562\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 10.000000\n",
|
|||
|
"50% 26.000000\n",
|
|||
|
"75% 69.000000\n",
|
|||
|
"max 27985.000000\n",
|
|||
|
"Name: num_comments, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEMCAYAAADXiYGSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAV1klEQVR4nO3df5BdZ33f8ffuai2wJYhne13HsoxwQd8MikERdTwdsJMwIelMfrS1HRo1iWvy06ltFSaTOiXETUmcAuOBxrUSOYEGFbCSCSROYTpxxx2SWFMyYYjljEn9tU1sI2SDlsWAVz/Xu9s/zhGsV3dXz73ae++5V+/XjGa1z3PPOd/nHul+7rnPueeMLS4uIknSmYwPugBJ0nAwMCRJRQwMSVIRA0OSVMTAkCQVWTfoAnpkPXAl8CwwP+BaJGlYTADfDnwGOLG8c1QD40rgwUEXIUlD6mpg//LGUQ2MZwGee+4ICwudf89kamoDMzOza15UE4zq2EZ1XDC6Y3NczTM+PsaFF14A9WvocqMaGPMACwuLXQXGqWVH1aiObVTHBaM7NsfVWG0/ynfSW5JUxMCQJBUxMCRJRQwMSVIRA0OSVMTAkCQVMTAkSUVG9XsYZ+Xrsyc4cnKhbd9L108w7k2nJJ2DDIw2Zo/OsftjB9r23Xz9di44zwMzSeceX/kkSUUMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBUxMCRJRQwMSVIRA0OSVMTAkCQVMTAkSUUMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBUxMCRJRQwMSVIRA0OSVGQoAiMi9kbEBwZdhySdy9b1a0MRcSdwHbAFuCIzH6nbtwJ7gSlgBrghMx9fstwtwP3Am/pVqyTpdP08wrgPuAZ4eln7HmB3Zm4FdgP3nOqIiNcD5wP/t081SpJW0LcjjMzcDxAR32yLiIuAHcCb66Z9wN0R0crMaeCHgM3AHcCOiLg6Mx8s3ebU1Iauaj10eJbJyfZPzeTkBK1Wd+ttilZr46BL6IlRHReM7tgc13DpW2CsYDNwKDPnATJzPiKeqdunM/NdABGxBXhnJ2EBMDMzy8LCYhdljTE390Lbnrm5eaann+9inc3Qam0c6vpXMqrjgtEdm+NqnvHxsVXfaA86MIpk5lPAzw66Dkk6lw36LKmDwKaImACof15St0uSGmSggZGZh4EDwM66aSfwUD1/IUlqkL4FRkTcFRFfBC4FHoiIz9VdNwG3RsRjwK3175KkhunnWVK7gF1t2h8FrupXHZKk7gx6DkOSNCQMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBUxMCRJRQwMSVIRA0OSVMTAkCQVMTAkSUUMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBUxMCRJRQwMSVIRA0OSVMTAkCQVMTAkSUUMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBVZN+gCVhMRrwH+PTBBVetbM3NxsFVJ0rmpb4EREXcC1wFbgCsy85G6fSuwF5gCZoAbMvNxgMz8e+AX6sf9MXABMNuvmiVJ39LPj6TuA64Bnl7WvgfYnZlbgd3APUs7I+L7IuKjwFeAo32oU5LURt8CIzP3Z+bBpW0RcRGwA9hXN+0DdkREa8lyn8rMnwBeALb3qVxJ0jKDnsPYDBzKzHmAzJyPiGfq9umI+F7gemAMmAQe6WTlU1Mbuirq0OFZJifbPzWTkxO0Wt2ttylarY2DLqEnRnVcMLpjc1zDZdCBsarM/AvgL7pdfmZmloWFbubIx5ibe6Ftz9zcPNPTz3db0sC1WhuHuv6VjOq4YHTH5riaZ3x8bNU32oM+rfYgsCkiJgDqn5fU7ZKkBhloYGTmYeAAsLNu2gk8lJnTAytKktRW3wIjIu6KiC8ClwIPRMTn6q6bgFsj4jHg1vp3SVLD9G0OIzN3Ab
vatD8KXNWvOiRJ3Rn0HIYkaUgYGJKkIo0+rbaJzpsc58jJhdPaX7p+gvFFL3MlaXQZGB06dnKePR9/+LT2m6/fzgXnecAmaXT5CidJKmJgSJKKFAdGRPzYCu3Xr105kqSm6uQI44MrtP/eWhQiSWq2M056R8Tl9V/HI+KVVFeOPeVy4HgvCpMkNUvJWVJPAItUQfH5ZX1fAn59jWuSJDXQGQMjM8cBIuIvM/N7el+SJKmJiucwDAtJOrcVf3Gvnr+4g+o2qS+6w0ZmXra2ZUmSmqaTb3rfSzWH8UvA0d6UI0lqqk4CYxvwhsw8/UJKkqSR18n3MP4K+K5eFSJJarZOjjCeAu6PiD+hOp32mzLz9rUsSpLUPJ0ExgXAJ4BJYHNvypEkNVVxYGTmW3tZiCSp2To5rfbylfoy8x/WphxJUlN18pHU0kuEnHLqFnMTa1aRJKmROvlI6kVnVEXExcB/Ah5c66IkSc3T9Q2UMvNLwNuA/7Jm1UiSGuts77gXwPlrUYgkqdk6mfR+kG/NWUAVFNuAd611UZKk5ulk0vsDy34/AjycmY+vYT2SpIbqZNJ7by8LkSQ1WycfSU0C7wR+CrgEeAb4MHBHZp7sTXnD47zJcY6cPP26jC9dP8H44mKbJSRpuHTykdR7ge8GbgKeBl4B/BrwMuDta1/acDl2cp49H3/4tPabr9/OBeed7bkFkjR4nQTGjwGvy8yZ+veMiL8FHsbAkKSR18lb37EO2yVJI6STI4w/Bj4REf8Z+ALVR1LvrNslSSOuk8D4D1QBsZtq0vsQsA/4zR7UBUBEvBF4K7Ae+Fpm3tKrbUmSVnfGwIiINwA/mpm3AbfXf071vQfYAfx1wXruBK4DtgBXZOYjdftWYC8wBcwAN5z6bkdm7gf214/7s4jYkJmznQxQkrQ2SuYw3kF1e9Z2PgX8auG27gOuoTrDaqk9wO7M3Ep19HLP8gUj4oeA/2dYSNLglATGduDPV+h7AHh9yYYyc39mHlzaFhEXUR2h7Kub9gE7IqK15DE3Aldm5q+UbEeS1BslcxgvA84DjrXpmwQ2nsX2NwOHMnMeIDPnI+KZun06In6Yao7kkxGxB/i1zJwuXfnU1Iauijp0eJbJyfZPzdjYWNu+ldonJydotbqro1darbPZZc01quOC0R2b4xouJYHxKPADwJ+16fuBur8nMvOTwKXdLj8zM8vCQjffsh5jbu6Ftj2Li4tt+1Zqn5ubZ3r6+S5q6I1Wa2Oj6lkrozouGN2xOa7mGR8fW/WNdslHUu8H7omIayNiHCAixiPiWqr5h/edRX0HgU0RMVGvd4LqDKyDqy4lSeq7MwZGZt5LdVmQvcDx+iOj48CHgPdm5r5VFj/Tug8DB4CdddNO4KFOPnaSJPVH0fcwMvN9EfEB4J/xrdNfP52Z3yjdUETcBVwLXAw8EBEzmbmN6tpUeyPiduA54IYOxyBJ6oNOLm/+DeD+bjeUmbuAXW3aHwWu6na9kqT+6OSb3uqClz2XNCoMjB7zsueSRoWvWJKkIgaGJKmIgSFJKmJgSJKKGBiSpCIGhiSpiIEhSSpiYEiSihgYkqQiBoYkqYiBIUkqYmBIkooYGJKkIgaGJKmIgSFJKuL9MAbEGytJGjYGxoB4YyVJw8bAaBiPPCQ1lYHRMB55SGoqX4EkSUUMDElSEQNDklTEwJAkFTEwJElFDAxJUhEDQ5JUxMCQJBUxMCRJRQwMSVIRA0OSVKTR15KKiJcD7wfenJmbB12PJJ3L+hYYEXEncB2wBbgiMx+p27cCe4EpYAa4ITMfB8jMrwM/HREP9KvOpvIqtpIGrZ9HGPcBvw08uKx9D7A7Mz8SET8J3AO8qY91DYWVrmL79p07ODZnkEjqvb4FRmbuB4iIb7ZFxEXADuDNddM+4O6IaGXmdL9qG2ZeDl1Svwx6DmMzcCgz5wEycz4inqnbpwEiYjfwHRGxB3hPZj5ZuvKpqQ1dFXXo8CyTk+2fmrGxsb
Z9TWufnJyg1Wo//lZrY9v2YTeq44LRHZvjGi6DDowzysybgZu7WXZmZpaFhW4+lhljbu6Ftj2Li4tt+5rWPjc3z/T
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(db.num_comments.describe())\n",
|
|||
|
"sns.histplot(db.num_comments,log_scale=(False,True),bins=50);"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "839e5d30-ab8f-4871-8191-20cf3b1b7c2d",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"The distribution appears to be heavily affected by outliers, so I'll pick a cutoff at 6000 comments and see where that gets us."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"id": "0e5a5c07-51c4-4251-a955-8a16c9500fad",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:41.471990Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:41.471813Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:41.478361Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:41.477689Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:41.471973Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"21"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"len(db[db.num_comments>6000])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "9e465633-ebbd-4497-8b27-52ad0aa48048",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"There are some extreme outliers here: only 21 of the 48,565 posts have more than 6000 comments. They skew the data, and there are so few of them that we won't miss them, so we'll drop them and re-plot."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"id": "4a028cd9-0e67-4e35-865d-9f2cf1fbc23f",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:41.479374Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:41.479182Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:41.912305Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:41.911570Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:41.479357Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 48544.000000\n",
|
|||
|
"mean 79.740957\n",
|
|||
|
"std 223.601340\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 10.000000\n",
|
|||
|
"50% 26.000000\n",
|
|||
|
"75% 68.250000\n",
|
|||
|
"max 5876.000000\n",
|
|||
|
"Name: num_comments, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZEAAAEMCAYAAAAF2YvKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAX80lEQVR4nO3df5BlZX3n8Xd3TzMIMxpq0oQMDI5E5mvJIoghbArBmFrjVqJVu0CMs9kQzJqEXWBWazdLgoquCSl1Kd1lnWRI1GRWZZKNMSRau2GLLU2YWnfjKkMKs3wBw49xQKdtUWmYYZru3j/Oabj03Nt97zO37695v6qmuvs599zzfW733M895znnPGOLi4tIklRivN8FSJKGlyEiSSpmiEiSihkikqRihogkqdi6fhewRtYDFwKPA/N9rkWShsUE8MPAl4Fn2llhVEPkQuCufhchSUPqEmBvOw8c1RB5HOCJJ55iYaHz62A2bdrAzMxs14vqF/sz2OzP4BqlvsDq/RkfH+OUU06G+j20HaMaIvMACwuLRSGytO4osT+Dzf4MrlHqC7Tdn7aHARxYlyQVM0QkScUMEUlSMUNEklTMEJEkFTNEJEnFDBFJUrFRvU7kmHxv9hmeOrLQdNmL1k8w7kRekgQYIk3NPj3Hzs/sa7rsmivO5+QT3IGTJPBwliTpGBgikqRihogkqZghIkkqZohIkooZIpKkYoaIJKmYISJJKmaISJKKGSKSpGKGiCSpmCEiSSpmiEiSihkikqRihogkqZghIkkqZohIkooZIpKkYoaIJKmYISJJKjYUIRIRuyPiY/2uQ5L0Qut6taGIuBm4HNgKnJuZ99bt24DdwCZgBrgyMx9oWO9a4A7gJ3tVqySpPb3cE7kduBR4ZFn7LmBnZm4DdgK3Li2IiNcAJwH/q0c1SpI60LMQycy9mbm/sS0iTgUuAPbUTXuACyJiqv75Z4CzgZuAiyPikl7VK0laXc8OZ7WwBTiQmfMAmTkfEY/V7dOZ+X6AiNgKvDsz7+rkyTdt2lBU1IGDs0xONn9pJicnmJoqe95+mpra2O8Susr+DLZR6s8o9QW6359+h0hbMvNh4O2drjczM8vCwmLBFseYm3u26ZK5uXmmp58seM7+mZraOHQ1r8T+DLZR6s8o9QVW78/4+FjHH777fXbWfuD0iJgAqL9urtslSQOuryGSmQeBfcD2umk7cHdmTvetKElS23oWIhFxS0R8AzgDuDMivlYvuhq4LiLuB66rf5YkDYGejYlk5g5gR5P2+4CLelWHJKl7+j0mIkkaYoaIJKmYISJJKmaISJKKGSKSpGKGiCSpmCEiSSpmiEiSihkikqRihogkqZghIkkqNhTziQySEybHeerIwlHtL1o/wfhiydwlkjS8DJEOHToyz64/veeo9muuOJ+TT3DHTtLxxXc9SVIxQ0SSVMwQkSQVM0QkScUMEUlSMUNEklTMEJEkFTNEJEnFDBFJUjFDRJJUzBCRJBUzRCRJxQwRSVIxQ0SSVMwQkSQVM0QkScUMEUlSMWc27BKnzZV0PDJEusRpcyUdj3x3kyQVG+g9kYh4JfCvgQmqWt+WmR4bkqQB0bMQiYibgcuBrcC5mXlv3b4N2A1sAmaAKzPzAYDM/DvgV+vH/QlwMjDbq5olSSvr5eGs24FLgUeWte8CdmbmNmAncGvjwoh4fUR8Gvg28HQP6pQktalnIZKZezNzf2NbRJwKXADsqZv2ABdExFTDel/IzJ8HngXO71G5kqQ29HtMZAtwIDPnATJzPiIeq9unI+IngCuAMWASuLeTJ9+0aUNRUQcOzjI52fylGRsba7qsVfvk5ARTU2V1dNPU1MZ+l9BV9mewjVJ/Rqkv0P3+9DtEVpSZXwS+WLr+zMwsCwsl4/BjzM0923TJ4uJi02Wt2ufm5pmefrKghu6ZmtrY9xq6yf4MtlHqzyj1BVbvz/j4WMcfvvt9iu9+4PSImACov2
6u2yVJA66vIZKZB4F9wPa6aTtwd2ZO960oSVLbehYiEXFLRHwDOAO4MyK+Vi+6GrguIu4Hrqt/liQNgZ6NiWTmDmBHk/b7gIt6VYckqXv6PSYiSRpihogkqdhAn+I7CrxFvKRRZoisMW8RL2mU+S4mSSpmiEiSihkikqRihogkqVjbIRIRP9ui/YrulSNJGiad7Il8vEX773WjEEnS8Fn1FN+IOKv+djwiXkY1t8eSs4DDa1GYJGnwtXOdyIPAIlV4fH3Zsm8C7+tyTZKkIbFqiGTmOEBE/FVmvm7tS5IkDYu2x0QMEEnScm3f9qQeD7kJOB94wfyJmXlmd8uSJA2DTu6ddRvVmMi/AZ5em3KOH96YUdIo6CREzgEuzsyj3/nUMW/MKGkUdPJu9dfAq9eqEEnS8OlkT+Rh4I6I+CzVqb3Pycwbu1mUJGk4dBIiJwOfAyaBLWtTjiRpmLQdIpn5trUsRJI0fDo5xfesVssy8++7U44kaZh0cjir8fYnS5bORZ3oWkWSpKHRyeGsF5zJFRGnAe8F7up2Ucczrx+RNEw62RN5gcz8ZkS8A7if6kJEdYHXj0gaJsf6rhTASd0oRJI0fDoZWL+L58dAoAqPc4D3d7soSdJw6ORw1seW/fwUcE9mPtDFeiRJQ6STgfXda1mIJGn4dHI4axJ4N/ALwGbgMeCTwE2ZeWRtytMSz9qSNIg6OZz1IeDHgKuBR4CXAu8BXgy8s/ulqZFnbUkaRJ2EyM8C52XmTP1zRsRXgXswRCTpuNTJR9ixDtslSSOukz2RPwE+FxH/HniU6nDWu+v2NRERrwXeBqwHvpuZ167VtoaVYyWS+qmTEPl3VKGxk2pg/QCwB/itdlaOiJuBy4GtwLmZeW/dvg3YDWwCZoArl04bzsy9wN76cX8eERsyc7aDmkeeYyWS+mnVd5mIuDgiPpiZRzLzxsx8eWaelJlnU+0hXNDmtm4HLqUalG+0C9iZmduoAurWJjX8DPD/DBBJGiztfFS9gWpq3Ga+ALyrnQ1l5t7M3N/YFhGnUoXQnrppD3BBREw1POYq4MLM/PV2tiNJ6p12DmedD/xli2V3Ap84hu1vAQ5k5jxAZs5HxGN1+3REvInqcNnnI2IX8J7MnG73yTdt2lBU1IGDs0xONn9pxsbGmi4btPbJyQmmpp7v/9TUxqM7M8Tsz2Abpf6MUl+g+/1pJ0ReDJwAHGqybBJYs1c4Mz8PnFG6/szMLAsLJYPLY8zNPdt0yeLiYtNlg9Y+NzfP9PSTQPVHs/T9KLA/g22U+jNKfYHV+zM+Ptbxh+92DmfdB/xUi2U/VS8vtR84PSImAOqvm+t2SdKAaydEPgLcGhGXRcQ4QESMR8RlVIPiHy7deGYeBPYB2+um7cDdnRyyUnNLp/4+dWSBAwdnn/t+YczLeiR1z6qHszLztnoWw93A+oj4NvCDwGHgvZm5Z8UnqEXELcBlwGnAnRExk5nnUN1GZXdE3Ag8AVxZ1hU1ajz1d3Jy3XOHvDz1V1I3tXWdSGZ+OCI+Bvw4z1/P8aXM/H67G8rMHcCOJu33ARe1+zySpMHRya3gvw/csYa1SJKGjMc1JEnFDBFJUjFDRJJUrJMbMGoEeNdfSd1kiBxnvOuvpG7yXUOSVMwQkSQV83CWVrQwNsahZ+aPancMRRIYIlrFoWfm2fmZfUe1O4YiCTycJUk6Bu6JCGh96i/e9FfSCgwRAa1P/b368vP6UI2kYeHhLElSMUNEklTMEJEkFTNEJEnFDBFJUjFDRJJUzBCRJBXzOhF1nffbko4fhoi6zvttSccP/0dLkooZIpKkYoaIJKmYYyIq0vKuv9Dyzr+t1nHAXRpehoiKtLrrL7S+82+rdRxwl4aX/3MlScXcE1HfeZhLGl6GiPrOw1zS8DJENDK8Ul7qPUNEI8Mr5aXeM0Q0sFqNlZww+0wfqpHUzECHSES8BPgI8IbM3NLvetRbrcZK3rH9NZzQ4loUSb3VsxCJiJ
uBy4GtwLmZeW/dvg3YDWwCZoArM/MBgMz8HvBLEXFnr+qUJLWvlweKbwcuBR5Z1r4L2JmZ24CdwK09rEmSdAx6FiK
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"db = db.drop(db[db.num_comments>6000].index)\n",
|
|||
|
"print(db.num_comments.describe())\n",
|
|||
|
"sns.histplot(db.num_comments,log_scale=(False,True),bins=50);"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "8da9e6ae-24f5-4fc1-b6d6-2ad382a32fcd",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"This gives a much clearer picture. I put the count (y) axis of both plots on a log scale so we can actually see the distribution; otherwise there would just be one massive bar near 0 that pushes everything else so far down that you can't see anything. I could probably go further, but I don't want to misrepresent the data."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "ebe836cb-249b-4b4c-bde6-a6637a847063",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Comparing Features\n",
|
|||
|
"There are a few features that jumped out at me as interesting:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"id": "d93d2f88-6cf0-43b9-b347-f392c98c350e",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:41.913362Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:41.913187Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:41.921295Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:41.920552Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:41.913345Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Out of 48544 total posts...\n",
|
|||
|
"--------------------------------\n",
|
|||
|
"2495 posts are self posts\n",
|
|||
|
"1001 posts are NSFW (18+)\n",
|
|||
|
"866 posts are marked as spoilers\n",
|
|||
|
"830 posts are of original content\n",
|
|||
|
"212 posts are locked\n",
|
|||
|
"145 posts are stickied\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(f'\\\n",
|
|||
|
"Out of {len(db)} total posts...\\n\\\n",
|
|||
|
"--------------------------------\\n\\\n",
|
|||
|
"{db.is_self.value_counts()[True]} posts are self posts\\n\\\n",
|
|||
|
"{db.over_18.value_counts()[True]} posts are NSFW (18+)\\n\\\n",
|
|||
|
"{db.spoiler.value_counts()[True]} posts are marked as spoilers\\n\\\n",
|
|||
|
"{db.is_original_content.value_counts()[True]} posts are of original content\\n\\\n",
|
|||
|
"{db.locked.value_counts()[True]} posts are locked\\n\\\n",
|
|||
|
"{db.stickied.value_counts()[True]} posts are stickied\\n\\\n",
|
|||
|
"')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "05ce41b2-5669-4d0b-a4d5-ff8d8d7171cb",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"While these posts make up only a small fraction of the total, I feel that including these flags can only help the accuracy of the model, even if not by much. Each one adds just a single column, so it's cheap as well. If I had more time I would probably create a `has_flair` column too."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "d79c7d53-1ee3-4180-b40f-1069a7f85ae1",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Numerics vs Target\n",
|
|||
|
"Some numeric columns I would like to include are score, post age, upvote ratio, and number of comments (which will become the target). Let's get an idea of what's going on."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"id": "252bb5a6-d24a-4ef2-b093-3a9c780c3812",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:41.922293Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:41.922118Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:50.783370Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:50.782637Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:41.922275Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 48544.000000\n",
|
|||
|
"mean 2130.988176\n",
|
|||
|
"std 6702.179836\n",
|
|||
|
"min 45.000000\n",
|
|||
|
"25% 263.000000\n",
|
|||
|
"50% 568.000000\n",
|
|||
|
"75% 1465.000000\n",
|
|||
|
"max 193501.000000\n",
|
|||
|
"Name: score, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAaUAAAGoCAYAAADmTPpwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAB9+UlEQVR4nO39e7hcd3nfDX/WYc579nnraEuWZLTkAzYWBhlsJ46FjQMBHOoWgxPSJn3aNG3T5H3fx+2Vt6F90ye9ErdP0yaBQJLyhMQYSFVjDCbGxuDgMxjbCIO1ZMmSJeu8j7P3HNfp/WPNjGZmz+w9M3tmz9p77s91gaw1s2bdc9Dvu+77dx8Uz/MQBEEQhCCg9toAQRAEQSghoiQIgiAEBhElQRAEITCIKAmCIAiBQURJEARBCAx6rw1YKRcuzLecPjgyEmdmJtMNc9pC7GlMkGwBsWc5gmRPkGyBxfZMTCSVHpoTWPrSU9J1rdcmVCH2NCZItoDYsxxBsidItkDw7AkqfSlKgiAIQjARURIEQRACg4iSIAiCEBhElARBEITAIKIkCIIgBAYRJUEQBCEwiCgJgiAIgUFESRAEQQgMIkqCIAhCYBBREgRBEAKDiJIgCIIQGESUBEEQhMAgoiQIgiAEBhElQRAEITCIKAmCIAiBQURJEARBCAwiSoIgCEJgEFESBEEQAoOIkiAIghAYRJQEQRCEwCCiJAiCIAQGESVBEAQhMIgoCYIgCIFBREkQBEEIDCJKgiAIQmAQURIEQRACg4iSIAiCEBhElARBEITAIKIkCIIgBAYRJUEQBCEwiCgJgiAIgUFESRAEQQgMIkqCIAhCYBBREgRBEAKDiJIgCIIQGESUBEEQhMAgoiQIgiAEhr4Vpa899UavTRAEQRBq6FtR8nptgCAIgrCIvhOlTM7mv9z/Ij88fAHzxEyvzREEQRAq6DtR8vBYyFicncrwjWeP99ocQRAEoYK+E6VENMT/75+9h0smEhw/O4/nSSBPEAQhKPSdKJUYHoiQztlcmM322hRBEAShSB+LUhiA42fne2yJIAiCUKJvRSkZD6NrKsfOpHptiiAIglCkb0VJVRW2bRzg+BnxlARBEIJC34oSgOd6HD83j+24vTZFEARBoM9FaeNojHzB4fmfnOu1KYIgCAJ9LkoTwzEu3TDA373wJq6khguCIPScvhYlRVHYMBzlzFSGr37vDQnjCYIg9Bi91wb0mi3jCc7NZHnkuTd5+uAZjG3DbBiJMToYZTgRIRLWiIQ0IiG1/N+aqqAoCooCCsU/S38v/Tf+n4IgCELzrHVR0lW1vYU/GQ+V//tn37GFybkcZ6YznJ3O8ObZ+Y41bK0ULhSg9N+Vz2Fxg9im3lUTT1Kae6UqVAXcGoOaeZXmNLg1e+rZ0kvEnqUJkj2rZUskrPF/fOhKNgzHlreper26DHgLsLtj2dpEWeNtdi4DjvXaCEEQhDbZARzvtRFBYq2Lkg5c0msjBEEQ2kQ8pRrWuigJgiAI64i+zr4TBEEQgoWIkiAIghAYRJQEQRCEwCCiJAiCIAQGESVBEAQhMIgoCYIgCIFhrYuSjl9Au9Y7UwiCIDSir9a5tf4mLwGOTU0t4LbQT2RkJM7MTKZ7VrWI2NOYINkCYs9yBMmeINkCi+2ZmEg223OrrXUuyCz13te6p9QWuq712oQqxJ7GBMkWEHuWI0j2BMkWCJ49QaUvRUkQBEEIJiJKgiAIQmAQURIEQRACw6olOhiGEQX+CHgfkAOeM03znxmGsRv4AjAGTAGfNE3z9dWySxAEQQgOq+kp3YcvRrtN03w78LvF458FPm2a5m7g08DnVtEmQRAEIUCsiigZhjEAfBL4XdM0PQDTNM8ZhrEB2At8qfjULwF7DcOYWA27BEEQhGCxWuG7Xfihuf9gGMbPAQvAvweywCnTNB0A0zQdwzBOA5cCF1bJNkEQBCEgrJYo6cBO4GXTNP9PwzD2AV8H/m
EnXnxsbKDlcyYmkp24dMcQexoTJFtA7FmOINkTJFtgZfa0s86tRVZLlN7EH/n7JQDTNF8wDGMS31PaahiGVvSSNGALcLKVF2+10nliIsmFC/OtXKKriD2NCZItIPYsR5DsCZItsNieVgVqnXV0aPjYquwpmaY5CXwXuA2gmHG3ATgMvAJ8vPjUj+N7UxK6EwRB6ENWs/fdrwOfNwzj/wYs4JdN05w1DOPXgS8YhvEpYAY/IULoQw4eneTRF04wOZdjfCjKHfu2sT9g4RdBELrLqomSaZpvALfUOX4I2Ldadhw8OskTBw5y5sJCeeG7Ztf4al1eaMDBo5N88fHDaJpKPKozmy7wxccPMzQUZ/t4vNfmCYKwSvRVR4fSwjeTylYtfAePTvbatL7n0RdOoGkqkZCGoihEQhqapvLgk0d6bZogCKtIX4lSaeGLhvWqhe/RF0702rS+Z3IuR1iv/jmGdZXz08EZPSAIQvfpK1FqtPBNzuV6ZJFQYnwoSsF2q44VbJcNoxK6E4R+oq9EqdHCNz4U7ZFFQok79m3DcVzyloPneeQtB8dx+egtl/faNEEQVpG+EqXSwpcr2FUL3x37tvXatL7nml3j3HPbboYTYTI5m+FEmHtu2831V2zstWmCIKwia30cekuUsuyeePm0ZN8FkGt2jct3IQh9Tl+JEvgL3/4bdgSq0lsQBEHw6TtRardOqV5hp9zVC4IgdJa+EqVSnVIkrFXVKQFLCkyjws7lzhMEQRBao68SHdqtU2pU2Cn1TYIgCJ2lr0Sp3TolqW8SBEFYHfpKlNqtU5L6JkEQhNWhr0Sp3TqlRoWdUt8kCILQWfoq0aHdOqXS45J9JwiC0F36SpSg/TolKewUBEHoPn0VvhMEQRCCjYiSIAiCEBj6LnwndAfpeCEIQicQURJWjHS8EAShU0j4Tlgx0vFCEIROIaIkrBjpeCEIQqeQ8J2wLMvtF40PRZlNF4iEtPIx6XghCEI7iCgFjKAlDDSzX3THvm188fHD5PE9pILtSscLQRDaQkQpQAQxYaByvwggEtLIF4+XbJKOF4IgdAoRpQDRjACsNpNzOeLR6p9Jvf2ifu14ETTPVhDWOpLoECCCmDAgHdIbU/JsZ9OFKs/24NHJXpsmCGsWEaUAEUQBkA7pjZFUeEHoPCJKASKIAnDNrnHuuW03w4kwmZzNcCLMPbftlhAVwfRsBWGtI3tKASKoCQP9ul+0HJIKLwidR0QpYIgAdJdOJiZIKrwgdB4RJaFv6HTKfVA9W0FYy4go9YjKO/bNEwPsv26LLGZdphsp9+LZCkJnEVHqAbV37DOpbM+LZNthrdXoNFtzJQhC7xBR6gG1d+zRkIbteD0tkm2VIHafWA5JTOg9a+1GRlh9JCW8B6yHVOK1WKMTxJT7fkKKjYVmEFHqAUEskm2VtSisUnPVW9bijYyw+kj4rgfUphLnCvaau2Nfq6EwSUzoHbKnJzSDeEo9oPaOfWQwtubu2CUUJrTKeogQCN1HPKUeUXnHPjGR5MKF+R5b1BpSoyO0ihQbC82waqJkGMZxIFf8H8C/NU3zW4Zh7Aa+AIwBU8AnTdN8fbXsEtpHQmFCK8iNjNAMq+0p3WWa5qs1xz4LfNo0zfsNw/gl4HPAratslyAIq4DcyAjL0dM9JcMwNgB7gS8VD30J2GsYxkTvrBIEQRB6xWp7Sl80DEMBngZ+B7gUOGWapgNgmqZjGMbp4vELq2zbukEKFAVBWKuspijdbJrmScMwIsB/B/4U+KNOvPDY2EDL50xMJDtx6Y7RKXtefO0cX37iCLquMDQQZiFn8eUnjjA0FOf6Kzauuj2dIEi2gNizHEGyJ0i2wMrsaWedW4sonuet+kUNw3g78DCwDzgMjBW9JA0/2eFtpmk24yldBhybmlrAdZt/H+1ku3XT++hk9t19D7y0qH4obzkMJ8Lc+4m9q27PSgmSLSD2LEeQ7AmSLbDYnomJpNLkqZfRxjoXZJZ676
uyp2QYRsIwjKHifyvA3cArpmmeB14BPl586seBl5sUpFVjLbVHWYudFgRBEEqsVqLDRuBJwzAOAq8Cu4HfKD7268C
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x432 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(db.score.describe())\n",
|
|||
|
"sns.jointplot(x=db.score/100,y=db.num_comments/100,kind='reg');"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "a366340c-d4b8-4a42-a067-23c1cbdd74ba",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Both axes are divided by 100 to make the plot easier to read.\n",
|
|||
|
"\n",
|
|||
|
"Score is a strong predictor of the number of comments: the highest-scoring posts are almost guaranteed to have an above-median number of comments."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "3179bf66-9c82-4e7a-82c0-c534c5fc6159",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:50.784680Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:50.784339Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:55.975570Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:55.975060Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:50.784655Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 48544.000000\n",
|
|||
|
"mean 59368.201209\n",
|
|||
|
"std 23001.003540\n",
|
|||
|
"min 14846.543459\n",
|
|||
|
"25% 41004.177545\n",
|
|||
|
"50% 57975.297003\n",
|
|||
|
"75% 76686.219161\n",
|
|||
|
"max 101274.168406\n",
|
|||
|
"Name: post_age, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAbMAAAGoCAYAAADM/AvfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAByeUlEQVR4nO39eZxk91nfi7/PUktX79ssGmk00khzRsIesBCMjI2xLWQPGDtgHLAsI8IaSG4S+L0SJ5ebmFzfwOUqEAhYGEPQjS5e5CBkW1jOIDO2wJKsASEp7bE1R5rW0tKsvXfXXmf5/fE9p7qqu6q7qrvWruf9es109zlVdb7nVNX5fJ/n+yya7/sIgiAIQjejt3sAgiAIgrBTRMwEQRCErkfETBAEQeh6RMwEQRCErkfETBAEQeh6zHYPoF3Mzq62JYxzdDTB4mK6HYfuCOT85fzl/Hd2/pOTg1qDhrOrEMusxZim0e4htBU5fzn/XqbXz7+ZiJgJgiAIXY+ImSAIQhMxIgaupuFqGgsr2eLvrqZhRMRSaxQ9u2YmCILQCvKOx1/+7TQA/f0xUqlccd9733YYkbPGIJaZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj4iZIAiC0PWImAmCIAhdj9nuAQhCO/A8n4LjUXA9HNej4JT+9Ck4rvrpejjh40oeU3A9XNfHD15P0zUcV/1lGhqxqEEsYhCLGvTHI0R1jf6+CFrEJFdwiZo6mqa17wIIwi5DxEzoGjzfJ5tzSGUd0lmHVLZQ9jOdc8jkHDI5l2zeIZt31d95l1zeWRMvx8fz/a0P2ERMQ2egz6S/L0J/PEJ/3GSgLxL8rbYPxNf+HhmIMZCIoIsACkJFRMyElpMruKQyBZKZAqlMgVTWIblOmJRgFYo/Q7HaTIMMXSMeM4lHDfqCnwOJKBOjyjrSNY2IqWOaOqahETEN4lEDQG039OLPRNzE9ynbFjHVc/769CvomoaurwnLj7ztMI98/SVAWX2O6+EGP992y9UkM+ocsnmX5WSOVNYhm3NYSeVJZQtcWcry8qVVUpkCBceren4jgzFGB2OMDMYYG4qzdzTBgcl+JoJtInZCryJiJuwYx/VYSeVZTuVZSuZYTqrfl1N5VtP5QLiUUCU3uVmDumEn4ib98QiJuMnwQIy9Y4ni34nQcolHeNa+QsTUiZpKcAxd470/cANf+tvpDa/7I287XL7d8ynkHd5927W1Pb5ke8Q0NmwvdRnqukZUX3vMgT2Dxdfq74/hFFxihsaPnzha+Rjffz1LyRypjFMU/aVkjqVkjjPn5lhcyXFxLkUm5+CViHvU1NkzmmDvWB/7xhLsGVU/944lGOyLiFtT2NWImAmbkiu4LKxkWVjJMb+SZWEly/xKlsXVnBKsZJ5kplDxuQN9EQYTylU2PhLn2r5B5pezaPhETIOoqRON6Pzg8Wv5xv+6UBSk0pvuelHJZQvksgW+721X8dqllaaff1vQNB5/9vyGze9/+w1ES6xB3/fJ5F3eZO3hwlyKywtpLi+keX02xXMvzuGWKF1fzGTfWB97xxLsG02wJxC8ieE++uOmCF2TyOYdvjWzxIuvL+F6PmPDfQz1mcSjcuttNHJFexjf90llHWaXMswvK5GaLxGu+eXsBqHSNBgZUK6ufeP93Hj1CMMDMYYGogz3RxkeiDE8EONv/+G1MjccKGH62j+8TiqVK9t+9Z5B+mLyUawXTdNIxExuPDiK/coCiajBdfsGuW7fIJ7v8/1vuprzV5JcXkwHQpfhhd
eWOf2ty5R6a6OmzmjgthwN3JhjQ3FGg/d5dCgmll2dLCVzPPLkq3x96gL5Uk/Ea8toGhyY6OfmQ2PtG+AuRO4guxzf91lO5bmymOHyYprZpQyzS1kuLajf01mn7PGxqMH4UJyxoTjX7h9ibCjO+HCcsy/NF9ehQpEKrSbf81heybK8ki1uXy9kQmvRNY3RoThPPKcsvKG+CEMHItx4YIh3v/kQ84HAza/kWAgs7cXVHPZrSyyu5DYEyJiGVpzErP0rEb/BGCMDsZ5/333f5+tTF/ncV8+RL7jcdvNejr9xP2dfnkfXNBzghVcWeeXSCpcXM7zzew6yZyjW7mHvCkTMdgmr6TwX59NcnE9xeWFNuK4sZcgX1maGuqYxPhwHfPaMKBdTIliP+rF33MCp06+WzcA9x+X4d+xjdj7VhrMSmkF/X4RoZJD9ewY37NM0jS8+9iK5vEsmr6JCMzmXg/uHmF/Osria5ZVLqzz74tyGtU9d0xgdjKoJUDAhGhuKFf8eH4rhtzmKtJlcWcpw//88y/OvLnLkmhH+yQ8dZd9YAlfTODezCMBof4w3XD/GtfsGePniKtGIpPo2ChGzLsL3feZXslyYU6IVitfF+XSZO9A0dPaM9jE50sfRQ2Pq99EEe0b6GBuOEzENHv6bcxtePxEXV1IvkHe8ioEnoKxqTQuiQmMmECtu/9LfTrN3JA6oz2LB8XjLdx3gymKGxVXlnl5YzrKwmmX6wgp/f/ZK2bodQF/MYHRQidz4UJyxwTWxGxtWrs2I2V03eM/z+eunX+Ohv30JXde4+90Wb/uuqzaNLB1MRDl2eJzBRJRNQ3SFmhEx61A83+fKYoZXL63y6uVVXruS5JWLK6RK3IIDfREOTPbzJmuS/eP97BtPsH+8n7HhOIauFwUrmcqTTOV5+fUlQN2YBGEnaJpGNGKwd7yfv//WJQBMDfaMxNkzEuf977yRXN5lJZUPAojUv5V0gSsLaRZWsrx6aZXV9MbgoeH+0LqLBdZd+e9Dic6ZdJ17fZnPnnqBly+ucuzwOHe/22JsKN7uYfUkImYdguf5vHYlyYsXlvn2K4uce22paG2ZhsaByQEmhuNcf1WUof4oA30RYhGjOGPOZgu8cn6ZV84vAyJYQnvJOx6PPP7Shu3/+PYjPPzYi1wz2Q+A63pk8i633ryXK4uZwJWpLLzXZ1NMTc+XB1CgPA+llt3IYIzh/igjA7HgX5ThgWjFFIpGcXkhzV/8zTRP27MMD0T5xffezPGb93aMyPYiImZtJJNz+NbLCzx3bo6p6fmieCXiJuNDcQ4fGGJkQEWSve/tlfOnBKGbMQydgT6dG64Z5ezLC2jA2ECUsYEoHBjix95xA0ura9bd/EqWxeD3xdUc3351kZVUfoM7EyhWThkeiJb9XBO8GCP9UaKR2kRvOZnjzMsL/N3zVzjz0jzRiMGPvvU63v29B4lFmyecQm2ImLWYbN7hqW9d4qnnL/OtlxZwPZ9E3OSNhyd4w/XjWNeO8vizr7d7mILQERRcn689PVO2LWbq7B9L8As/+kYVTev75Ase2YIKWPmOwxPML2dZTuZYSuZZTua4NLPIcrKy6CVC0euP0h83iUYMIqaO43iqWk3W4fJimoUVlVIyOhjjvW85xDvedIDhAYlE7BREzFqA7/ucnVniyW9e5JkXZ8nkXMaG4hzaN8i+sQSjQ6oM0exCiuNv2Nfu4QpCV6FpQWHnqMFwf5Tb3rCfL/3ttEpH6ItA4NL84e+/ns9/9UWyQa3ObN4lm3c5sGeQhZVQ/HLkHY98wSUaUYn9iZjJkWtGOLhnEOvgCIf2DYo7sQMRMWsiubzLU2cv8+jfvcbFuRR9MYPvuXkf33vzXo4cHBW3oSC0EF3TVCeDiAH90eL2cN05XMcLef87b6xYes00ddyCu2G7ETE2rO8BInwtQsSsCVxZyvDVf3idr09dJJNzGO6P8l03TnBgPMHQUB8vvLLAkYOj7R6mIAibUC
2F4f3vvBG3gkB5rs9fVqnnKTQfEbMG4fs+335lkVP/8Dr/69wcuq7x3dYk77z1IN+anpXZmSDsEqqJnIhWexEx2yH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x432 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(db.post_age.describe())\n",
|
|||
|
"sns.jointplot(x=db.post_age,y=db.num_comments, kind='reg');"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "0d9922a6-e7cf-4513-8dca-0bfd1aa33447",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"It could be clearer, but there is some correlation between post age and comment count. You can also see how sparse super hot posts actually are."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "b0c00c9b-0e22-411f-8fdf-13c06d2ae877",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:55.976540Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:55.976347Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:55.986845Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:55.986201Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:55.976523Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>title</th>\n",
|
|||
|
" <th>subreddit</th>\n",
|
|||
|
" <th>num_comments</th>\n",
|
|||
|
" <th>score</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>38037</th>\n",
|
|||
|
" <td>to push past armed courtroom guards</td>\n",
|
|||
|
" <td>therewasanattempt</td>\n",
|
|||
|
" <td>5876</td>\n",
|
|||
|
" <td>72261</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>36183</th>\n",
|
|||
|
" <td>Avs VS Oilers Western Conference Finals Game 4...</td>\n",
|
|||
|
" <td>ColoradoAvalanche</td>\n",
|
|||
|
" <td>5851</td>\n",
|
|||
|
" <td>300</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>33841</th>\n",
|
|||
|
" <td>Live WWE RAW Discussion Thread! - June 6th, 2022!</td>\n",
|
|||
|
" <td>SquaredCircle</td>\n",
|
|||
|
" <td>5682</td>\n",
|
|||
|
" <td>128</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>36694</th>\n",
|
|||
|
" <td>What would you never tell your current partner?</td>\n",
|
|||
|
" <td>AskMen</td>\n",
|
|||
|
" <td>5679</td>\n",
|
|||
|
" <td>10159</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1181</th>\n",
|
|||
|
" <td>Nearly half of families with kids can no longe...</td>\n",
|
|||
|
" <td>news</td>\n",
|
|||
|
" <td>5464</td>\n",
|
|||
|
" <td>60516</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28900</th>\n",
|
|||
|
" <td>Ohio House Republicans vote to put Canada on a...</td>\n",
|
|||
|
" <td>nottheonion</td>\n",
|
|||
|
" <td>5389</td>\n",
|
|||
|
" <td>53999</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>39946</th>\n",
|
|||
|
" <td>Elon Musk asserts his \"right to terminate\" Twi...</td>\n",
|
|||
|
" <td>technology</td>\n",
|
|||
|
" <td>5369</td>\n",
|
|||
|
" <td>28363</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28235</th>\n",
|
|||
|
" <td>Prince Louis of Cambridge</td>\n",
|
|||
|
" <td>funny</td>\n",
|
|||
|
" <td>5334</td>\n",
|
|||
|
" <td>104033</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>32875</th>\n",
|
|||
|
" <td>\"Everybody is trying to blame us\"</td>\n",
|
|||
|
" <td>PublicFreakout</td>\n",
|
|||
|
" <td>5230</td>\n",
|
|||
|
" <td>89886</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7931</th>\n",
|
|||
|
" <td>My GF’s friend stayed with us last weekend (fo...</td>\n",
|
|||
|
" <td>iamverybadass</td>\n",
|
|||
|
" <td>5173</td>\n",
|
|||
|
" <td>24474</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" title subreddit \\\n",
|
|||
|
"38037 to push past armed courtroom guards therewasanattempt \n",
|
|||
|
"36183 Avs VS Oilers Western Conference Finals Game 4... ColoradoAvalanche \n",
|
|||
|
"33841 Live WWE RAW Discussion Thread! - June 6th, 2022! SquaredCircle \n",
|
|||
|
"36694 What would you never tell your current partner? AskMen \n",
|
|||
|
"1181 Nearly half of families with kids can no longe... news \n",
|
|||
|
"28900 Ohio House Republicans vote to put Canada on a... nottheonion \n",
|
|||
|
"39946 Elon Musk asserts his \"right to terminate\" Twi... technology \n",
|
|||
|
"28235 Prince Louis of Cambridge funny \n",
|
|||
|
"32875 \"Everybody is trying to blame us\" PublicFreakout \n",
|
|||
|
"7931 My GF’s friend stayed with us last weekend (fo... iamverybadass \n",
|
|||
|
"\n",
|
|||
|
" num_comments score \n",
|
|||
|
"38037 5876 72261 \n",
|
|||
|
"36183 5851 300 \n",
|
|||
|
"33841 5682 128 \n",
|
|||
|
"36694 5679 10159 \n",
|
|||
|
"1181 5464 60516 \n",
|
|||
|
"28900 5389 53999 \n",
|
|||
|
"39946 5369 28363 \n",
|
|||
|
"28235 5334 104033 \n",
|
|||
|
"32875 5230 89886 \n",
|
|||
|
"7931 5173 24474 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"db[db.num_comments>4000].sort_values('num_comments',ascending=False)[['title','subreddit','num_comments','score']].head(10)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "f8f94abd-f480-4ddc-9ec3-74e0f5e80b53",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"I expected to see mostly news here, and while news and current events is here (30% in my case at time of writing) the rest of it is drastically different. Anything can go viral it seems"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "fde30375-0763-4d2c-88e3-5a5c516cb145",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:55.988879Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:55.988164Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:25:56.001194Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:25:56.000621Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:55.988838Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>title</th>\n",
|
|||
|
" <th>subreddit</th>\n",
|
|||
|
" <th>num_comments</th>\n",
|
|||
|
" <th>score</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>46529</th>\n",
|
|||
|
" <td>More of this please.</td>\n",
|
|||
|
" <td>MadeMeSmile</td>\n",
|
|||
|
" <td>5146</td>\n",
|
|||
|
" <td>157009</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>47421</th>\n",
|
|||
|
" <td>Helicopter footage of a loose cow being wrangl...</td>\n",
|
|||
|
" <td>interestingasfuck</td>\n",
|
|||
|
" <td>4347</td>\n",
|
|||
|
" <td>126608</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28235</th>\n",
|
|||
|
" <td>Prince Louis of Cambridge</td>\n",
|
|||
|
" <td>funny</td>\n",
|
|||
|
" <td>5334</td>\n",
|
|||
|
" <td>104033</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16638</th>\n",
|
|||
|
" <td>This $10 salad I paid for at a restaurant</td>\n",
|
|||
|
" <td>mildlyinfuriating</td>\n",
|
|||
|
" <td>4336</td>\n",
|
|||
|
" <td>103858</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>32875</th>\n",
|
|||
|
" <td>\"Everybody is trying to blame us\"</td>\n",
|
|||
|
" <td>PublicFreakout</td>\n",
|
|||
|
" <td>5230</td>\n",
|
|||
|
" <td>89886</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>22957</th>\n",
|
|||
|
" <td>Even the military knows assault rifles belong ...</td>\n",
|
|||
|
" <td>WhitePeopleTwitter</td>\n",
|
|||
|
" <td>4047</td>\n",
|
|||
|
" <td>80801</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>38037</th>\n",
|
|||
|
" <td>to push past armed courtroom guards</td>\n",
|
|||
|
" <td>therewasanattempt</td>\n",
|
|||
|
" <td>5876</td>\n",
|
|||
|
" <td>72261</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1181</th>\n",
|
|||
|
" <td>Nearly half of families with kids can no longe...</td>\n",
|
|||
|
" <td>news</td>\n",
|
|||
|
" <td>5464</td>\n",
|
|||
|
" <td>60516</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19201</th>\n",
|
|||
|
" <td>It Costs $110,000 to Fully Gear-Up in Diablo I...</td>\n",
|
|||
|
" <td>technology</td>\n",
|
|||
|
" <td>5150</td>\n",
|
|||
|
" <td>58244</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11976</th>\n",
|
|||
|
" <td>Electric Vehicles are measurably reducing glob...</td>\n",
|
|||
|
" <td>technology</td>\n",
|
|||
|
" <td>4120</td>\n",
|
|||
|
" <td>55080</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" title subreddit \\\n",
|
|||
|
"46529 More of this please. MadeMeSmile \n",
|
|||
|
"47421 Helicopter footage of a loose cow being wrangl... interestingasfuck \n",
|
|||
|
"28235 Prince Louis of Cambridge funny \n",
|
|||
|
"16638 This $10 salad I paid for at a restaurant mildlyinfuriating \n",
|
|||
|
"32875 \"Everybody is trying to blame us\" PublicFreakout \n",
|
|||
|
"22957 Even the military knows assault rifles belong ... WhitePeopleTwitter \n",
|
|||
|
"38037 to push past armed courtroom guards therewasanattempt \n",
|
|||
|
"1181 Nearly half of families with kids can no longe... news \n",
|
|||
|
"19201 It Costs $110,000 to Fully Gear-Up in Diablo I... technology \n",
|
|||
|
"11976 Electric Vehicles are measurably reducing glob... technology \n",
|
|||
|
"\n",
|
|||
|
" num_comments score \n",
|
|||
|
"46529 5146 157009 \n",
|
|||
|
"47421 4347 126608 \n",
|
|||
|
"28235 5334 104033 \n",
|
|||
|
"16638 4336 103858 \n",
|
|||
|
"32875 5230 89886 \n",
|
|||
|
"22957 4047 80801 \n",
|
|||
|
"38037 5876 72261 \n",
|
|||
|
"1181 5464 60516 \n",
|
|||
|
"19201 5150 58244 \n",
|
|||
|
"11976 4120 55080 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"db[db.num_comments>4000].sort_values('score',ascending=False)[['title','subreddit','num_comments','score']].head(10)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "523f31c6-7fdf-4079-8621-32574d400879",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Sorting by score is a similar story. This is where machine learning really blows my mind, there's no apparent correlation here but somehow it manages to find it"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"id": "75749905-250d-41aa-90fc-16cd06be8e73",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:25:56.002307Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:25:56.001959Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:26:01.321549Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:26:01.321049Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:25:56.002277Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 48544.000000\n",
|
|||
|
"mean 0.963388\n",
|
|||
|
"std 0.041302\n",
|
|||
|
"min 0.510000\n",
|
|||
|
"25% 0.950000\n",
|
|||
|
"50% 0.980000\n",
|
|||
|
"75% 0.990000\n",
|
|||
|
"max 1.000000\n",
|
|||
|
"Name: upvote_ratio, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAbMAAAGoCAYAAADM/AvfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABsPUlEQVR4nO39ebwk9Xnfi79r6eXs+wwDwzAweL6DhJCMhUESlhAymEgRchwcS8ImN7Idy47jSL97g/NybMVxYr9s7FzFjmRLskOCg5Du9USWsJExmMUyIOaHDHgEYgrNoGGYfc6+9OmllvtHVffppbpP1zm99/N+vWbOOVXdVd9vL/Wp5/k+i+Z5HoIgCILQzejtHoAgCIIgbBcRM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh6z3QNoFxcurNQM45yYGGRhIdWq4bScXp8f9P4cZX7dz1bmODMzojVpOF2NWGZVME2j3UNoKr0+P+j9Ocr8up9+mGOrEDETBEFoIprm/xOai4iZIAhCk9A0ePS5N3j0uTdE0JpM366ZCYIgtIKs7bR7CH2BWGaCIAhC1yNiJgiCIHQ9ImaCIAhC1yNiJgiCIHQ9ImaCIAhC1yNiJgiCIHQ9ImaCIAhC1yNiJgiCIHQ9ImaCIAhC1yNiJgiCIHQ9ImaCIAgNQGovthcRM0EQhG2iafC3L54WQWsjImaCIAgNQAoKtxcRM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh4RM0EQhIhomhQW7jREzARBECKgafDoc2/w6HNviKB1EGa7ByAIgtBtSFHhzkMsM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEEQuh4RM0EQBKHrETETBEGoQrNC7yVPrfGImAmCIISgafC3L55uuOgU56kJjUPyzARBEKrQrHwyyVNrPGKZCYIgCF2PiJkgCILQ9YiYCYIgIAEZ3Y6ImSAIfU+zgj3OL6Q4O59ibT3X2AMLFUgAiCAIAo0NyriwuM5X/+41vvnyOQAMXUPtmeCafdMNO4dQioiZIAh9haaB5zX+uLbj8uQLp/j7Vy/w8mvzGIbGB95xGcupLC8dm+MPDn6bf/uRt3HgsonGn1wQN6MgCP1DM9yJnufx9LfP8BdPHee+hy3Ozaf4wDsv43c+/k7uuGkfOycGePfbLmZ6LMl/f+gV1jN2404uFBDLTBCEvqKR7sQ3zq9y8MljHD42x9RYkk/8s7dy5SVjaGVqGTN1fvaDb+K37v977nvYYvfMYMVjhO0hYiYIQs/SDJfi4mqGI68vcGo2xcEnXyMZN/iJm69E02H/peNVz3fl7jH+yQ9dwVe+8Rrr+ybZf+l4YwfW54iYCYLQ9YSJVt6l+J63XbxtQVvP2Hzn+ALPvnyWF4/O4rgek6MJfvy9+3j3Wy9meCDGo9/avDzVB95xGcfPrvD8qxfEMmswImaCIHQNUUVrqy7FpdUMr5xY4NmXz/E33zrJhYV1PGB4IMYt112K57mMDMZ5/w2XRRJKTdP4udvfxH/8n8/xD0fnOHpykbGEsaUxCqWImAmC0FbqdQU2WrRytstKKsuFxXVOz67x3JHzfPvYHGfmUswupQGImzpXXzHFO998EfsvHefK3WPETJ2Hvnk88vnyxGMG73jzTs7Or3P5rlHm59e2fCxhAxEzQRAaTjWBKt9eTaA0Db7+9Pf4QTVdsr1ctDzPw3E9crbLcipLLueSc1xSaZvltSzHTi2RyTosrWZZWsuynMpyZm6Nr/3d91hLl0YVxkydi6eHuHzXKDdfu5urLhvHOrnIj1y3p+HrbpqmsWtqEMOQgPJGIWIm9CSe5xX+Veyr+cRau8J3bvVCV/15pTuKBSD/U9Mgk3PI5pyyZ3l4HriuP17PK/rperhe/rXZmJHrBs8JDu55+cdtPD
a/zw3+9op+EhzDDZ6LB4dfm+PqyydxXT//ynZ8kXnl+AJ7d42Qs11ytovtehw7tcTrZ1fIOS627RZ+XlhK8zf//9f959outuOxnMryl8+8XvK4/Kv1Z08cC301D3GegYTB6FCCsaEYY0MJvm/3GKNDccaG4kyNJrlkZohvWee5tUi4NA2+e2qpvjdTaDv9Kmamrm+++Fr8mJX1HJ/534er5ohs5YJW6ynVLpy1nljzeCHrDBvbtnDACOcqPWT0czUhv1VoAYePzYVuf+3McuF3Q9fQNI3juRVMU8fUNWKmjmHojI8kGB+KYRg6McPftriaYdfkIIapE9N1TFPDNHROza6x7+IxTEPDNDSSCZPRwTjfPbnEu9+6CzOwgDQNnn35HDe8eWeFJTgxksQwtBIxGx9OlGwrfixQ2Bf1sfltUHqdqZO9wElAEtaK0MLuXPuAvcD32j0IQRCELXI5cLzdg+gk+lXMTGB3uwchCIKwRcQyK6NfxUwQBEHoISSURhAEQeh6RMwEQRCErkfETBAEQeh6RMwEQRCErkfETBAEQeh6RMwEQRCErqdfxczET5zu1woogiD0Pn11neuLSYawG/je3NwqrhueZzcxMcjCQqq1o2ohvT4/6P05yvy6n63McWZmpN76V5te57qNWnPvV8tsU0yzt3sM9fr8oPfnKPPrfvphjq1CxEwQBEHoekTMBEEQhK5HxEwQBEHoeloWAKKUSgKfBn4YSAPftCzrXyql9gP3AVPAHHCXZVnfDZ6zpX2CIAhCf9FKy+wefBHbb1nWW4BfC7Z/DvisZVn7gc8Cny96zlb3CYIgCH1ESywzpdQwcBew27IsD8CyrHNKqR3AtcAtwUO/BHxGKTUDaFvZZ1nWhVbMSRAEQegcWmWZ7cN3Bf4HpdS3lFJPKqVuBC4FTlmW5QAEP08H27e6TxAEQegzWrVmZgJXAC9YlvVvlVLXA38B/HiLzh/K1NRwzf0zMyMtGkl76PX5Qe/PUebX/TR7jptd53qFVonZ6/gtvr8EYFnWIaXULLAOXKKUMizLcpRSBnAx8Aa+K3Er++qmVmb8zMwIFy6sbGmy3UCvzw96f44yv+5nK3OMKn49VgGk6r6WuBkty5oFniBY4woiEXcArwIvAh8JHvoRfOvtgmVZ57eyr+mTEQRBEDqOVtZm/Dhwr1LqvwA54Kcsy1pUSn0cuE8p9SlgAT9QpPg5W9knCEIHcfjYLA8fOsHsUprpsSS3Xb+Ha/ZNt3tYQg/RMjGzLOs14KaQ7UeA66s8Z0v7BEHoHA4fm+WLj76KYegMJk0W17J88dFXAZomaCKe/YdUABEEoak8fOgEhqGTiBlomkYiZmAYOg8fOtGU8+XFc3EtWyKeh4/NNuV8QmcgYiYIQlOZXUoTN0svNXFTZ3Yp3ZTztVo8hc5AxEwQhKYyPZYka7sl27K2y/RYsinna7V4Cp2BiJkgCE3ltuv34DgumZyD53lkcg6O43Lb9Xuacr5Wi6fQGYiYCYLQVK7ZN82dt+xnfChOKm0zPhTnzlv2Ny0go9XiKXQGrQzNFwShT7lm33TLognz55Foxv5CxEwQhJ6jleIpdAYiZoLQI/RrblW/zlsoRcRMEHqAdiQmdwL9Om+hEgkAEYQeoF9zq/p13kIlImaC0AP0a25Vv85bqETETBB6gH7NrerXeQuViJgJQg/Qr7lV/TpvoRIJABGEHqBfc6v6dd5CJSJmgtAj9GtuVb/OWyhF3IyCIAhC1yNiJgiCIHQ94mYUBKFhSDUOoV2ImAmC0BCkGofQTsTNKAhCQ5BqHEI7ETETBKEhSDUOoZ2Im1EQhIYwPZZkcS1LImYUtuWrcchamtBsRMwEQahJvUJ02/V7+OKjr5LBt8iytovjuBzYMy5raULTETejIAhVyQd1LK5lS4To8LHZisdes2+aO2/Zz/hQnFTaZnwozp237OfIiUVZSxOajlhmgtChdIJrrjioAyARM8gE28PGElaN4/
5HXmUwWXqpkbU0odGIZSYIHUgUi6iZNCKoQyrbC61AxEwQOpBOCXNvhBBJZXuhFYiYCUIH0ilh7o0QompraRL8ITQ
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x432 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(db.upvote_ratio.describe())\n",
|
|||
|
"sns.jointplot(x=db.upvote_ratio,y=db.num_comments, kind='reg');"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "9fcffe77-4aa6-4df3-a4bd-bc020a112c12",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Upvote ratio goes down as the number of comments increases, at first I thought this seemed backwards but I guess the majority of reddit comments are of such low quality that it drives the metric down into the dirt"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "af0cc9fa-7ba4-4579-b972-91102ea8ba0e",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Create Working Dataframes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "92c7db6d-1915-4161-9ffc-7bb6763cf064",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Let's put it in a dataframe so we can work with it. And also drop any NAs that appear (I don't see any, but I don't want any if they appear in the future)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"id": "20e22840-02f1-4d8e-8ee3-73835863f4a6",
|
|||
|
"metadata": {
|
|||
|
"execution": {
|
|||
|
"iopub.execute_input": "2022-06-07T15:26:01.323685Z",
|
|||
|
"iopub.status.busy": "2022-06-07T15:26:01.323365Z",
|
|||
|
"iopub.status.idle": "2022-06-07T15:26:01.619933Z",
|
|||
|
"shell.execute_reply": "2022-06-07T15:26:01.619372Z",
|
|||
|
"shell.execute_reply.started": "2022-06-07T15:26:01.323665Z"
|
|||
|
},
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"db[['title','subreddit','over_18','is_original_content','is_self','spoiler',\\\n",
|
|||
|
" 'locked','stickied','num_comments']].dropna().to_csv('data/workingdf.csv',index=False)\n",
|
|||
|
"\n",
|
|||
|
"db[['score','post_age','upvote_ratio']].to_csv('data/numerics.csv',index=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "154b0f51-a46a-47a1-b73b-0d7f1f1f8f57",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Next, we [clean](clean.ipynb)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.10.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|