mpg/clean.ipynb

808 lines
25 KiB
Text
Raw Normal View History

2022-07-21 16:31:53 -04:00
{
"cells": [
{
"cell_type": "markdown",
"id": "9151a000-1923-408b-bd86-16008dc95f97",
"metadata": {},
"source": [
"[readme](readme.md)"
]
},
{
"cell_type": "markdown",
"id": "cecbac86-abb3-4f6b-a101-2d9324d96274",
"metadata": {},
"source": [
"# Cleaning"
]
},
{
"cell_type": "markdown",
"id": "b67cb510-2df0-4ce4-a033-473710fdc749",
"metadata": {},
"source": [
"Load file and set column names"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3c4bfade-d06d-4887-9eb4-ec7f5bc61625",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:36.887038Z",
"iopub.status.busy": "2022-07-21T20:29:36.886672Z",
"iopub.status.idle": "2022-07-21T20:29:37.222976Z",
"shell.execute_reply": "2022-07-21T20:29:37.222218Z",
"shell.execute_reply.started": "2022-07-21T20:29:36.886962Z"
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# The data file has no header row and its fields are separated by runs of\n",
"# whitespace. sep=r'\\s+' is the documented equivalent of delim_whitespace=True,\n",
"# which is deprecated as of pandas 2.2.\n",
"df = pd.read_csv('data/auto-mpg.data', header=None, sep=r'\\s+')\n",
"\n",
"# Assigning the names afterwards raises if the file does not have exactly 9 columns.\n",
"df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
"              'acceleration','model_year','origin','car_name']"
]
},
{
"cell_type": "markdown",
"id": "fdcec7e3-c65e-4d66-9a10-b500fb940234",
"metadata": {},
"source": [
"Attribute Information:\n",
"\n",
" 1. mpg: continuous\n",
" 2. cylinders: multi-valued discrete\n",
" 3. displacement: continuous\n",
" 4. horsepower: continuous\n",
" 5. weight: continuous\n",
" 6. acceleration: continuous\n",
" 7. model year: multi-valued discrete\n",
" 8. origin: multi-valued discrete\n",
" 9. car name: string (unique for each instance)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "62bbb6bd-b5b3-4d54-a132-23cd367c4570",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.225459Z",
"iopub.status.busy": "2022-07-21T20:29:37.224901Z",
"iopub.status.idle": "2022-07-21T20:29:37.237624Z",
"shell.execute_reply": "2022-07-21T20:29:37.236773Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.225432Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 398 entries, 0 to 397\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 mpg 398 non-null float64\n",
" 1 cylinders 398 non-null int64 \n",
" 2 displacement 398 non-null float64\n",
" 3 horsepower 398 non-null object \n",
" 4 weight 398 non-null float64\n",
" 5 acceleration 398 non-null float64\n",
" 6 model_year 398 non-null int64 \n",
" 7 origin 398 non-null int64 \n",
" 8 car_name 398 non-null object \n",
"dtypes: float64(4), int64(3), object(2)\n",
"memory usage: 28.1+ KB\n"
]
}
],
"source": [
"# List every column's dtype and non-null count to spot columns that did not load as numbers\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"id": "6a4028ed-eda3-4c50-aed0-d9503d41a8e1",
"metadata": {},
"source": [
"Why is horsepower not a number?"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "58fa2876-4ccb-4ef5-bc16-d25b74efb457",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.239126Z",
"iopub.status.busy": "2022-07-21T20:29:37.238760Z",
"iopub.status.idle": "2022-07-21T20:29:37.252035Z",
"shell.execute_reply": "2022-07-21T20:29:37.251217Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.239098Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',\n",
" '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',\n",
" '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',\n",
" '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',\n",
" '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',\n",
" '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',\n",
" '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',\n",
" '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',\n",
" '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',\n",
" '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',\n",
" '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',\n",
" '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',\n",
" '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',\n",
" '116.0', '82.00'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct raw horsepower strings, to find the entries that are not numeric\n",
"df['horsepower'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2d99ea58-ca51-4461-a127-c6b389b056a1",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.253416Z",
"iopub.status.busy": "2022-07-21T20:29:37.253082Z",
"iopub.status.idle": "2022-07-21T20:29:37.271785Z",
"shell.execute_reply": "2022-07-21T20:29:37.271054Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.253389Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mpg</th>\n",
" <th>cylinders</th>\n",
" <th>displacement</th>\n",
" <th>horsepower</th>\n",
" <th>weight</th>\n",
" <th>acceleration</th>\n",
" <th>model_year</th>\n",
" <th>origin</th>\n",
" <th>car_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>25.0</td>\n",
" <td>4</td>\n",
" <td>98.0</td>\n",
" <td>?</td>\n",
" <td>2046.0</td>\n",
" <td>19.0</td>\n",
" <td>71</td>\n",
" <td>1</td>\n",
" <td>ford pinto</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>21.0</td>\n",
" <td>6</td>\n",
" <td>200.0</td>\n",
" <td>?</td>\n",
" <td>2875.0</td>\n",
" <td>17.0</td>\n",
" <td>74</td>\n",
" <td>1</td>\n",
" <td>ford maverick</td>\n",
" </tr>\n",
" <tr>\n",
" <th>330</th>\n",
" <td>40.9</td>\n",
" <td>4</td>\n",
" <td>85.0</td>\n",
" <td>?</td>\n",
" <td>1835.0</td>\n",
" <td>17.3</td>\n",
" <td>80</td>\n",
" <td>2</td>\n",
" <td>renault lecar deluxe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>336</th>\n",
" <td>23.6</td>\n",
" <td>4</td>\n",
" <td>140.0</td>\n",
" <td>?</td>\n",
" <td>2905.0</td>\n",
" <td>14.3</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>ford mustang cobra</td>\n",
" </tr>\n",
" <tr>\n",
" <th>354</th>\n",
" <td>34.5</td>\n",
" <td>4</td>\n",
" <td>100.0</td>\n",
" <td>?</td>\n",
" <td>2320.0</td>\n",
" <td>15.8</td>\n",
" <td>81</td>\n",
" <td>2</td>\n",
" <td>renault 18i</td>\n",
" </tr>\n",
" <tr>\n",
" <th>374</th>\n",
" <td>23.0</td>\n",
" <td>4</td>\n",
" <td>151.0</td>\n",
" <td>?</td>\n",
" <td>3035.0</td>\n",
" <td>20.5</td>\n",
" <td>82</td>\n",
" <td>1</td>\n",
" <td>amc concord dl</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mpg cylinders displacement horsepower weight acceleration \\\n",
"32 25.0 4 98.0 ? 2046.0 19.0 \n",
"126 21.0 6 200.0 ? 2875.0 17.0 \n",
"330 40.9 4 85.0 ? 1835.0 17.3 \n",
"336 23.6 4 140.0 ? 2905.0 14.3 \n",
"354 34.5 4 100.0 ? 2320.0 15.8 \n",
"374 23.0 4 151.0 ? 3035.0 20.5 \n",
"\n",
" model_year origin car_name \n",
"32 71 1 ford pinto \n",
"126 74 1 ford maverick \n",
"330 80 2 renault lecar deluxe \n",
"336 80 1 ford mustang cobra \n",
"354 81 2 renault 18i \n",
"374 82 1 amc concord dl "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose horsepower is the '?' placeholder\n",
"df.loc[df['horsepower'] == '?']"
]
},
{
"cell_type": "markdown",
"id": "498d069d-b95e-43d6-bd3d-4b707fdd9635",
"metadata": {},
"source": [
"I'll fill in what I can find online"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e53a2eaf-a8f9-4d7e-bf8b-07a125cf6f06",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.273324Z",
"iopub.status.busy": "2022-07-21T20:29:37.272853Z",
"iopub.status.idle": "2022-07-21T20:29:37.278574Z",
"shell.execute_reply": "2022-07-21T20:29:37.277496Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.273297Z"
},
"tags": []
},
"outputs": [],
"source": [
"# 1971 pinto kent I4\n",
"df.at[32,'horsepower'] = '75.0'\n",
"# 1974 maverick 200 I6\n",
"df.at[126,'horsepower'] = '85.0'\n",
"# 1980 renault lecar deluxe 85ci I4\n",
"df.at[330,'horsepower'] = '53.5'\n",
"# 1980 ford mustang cobra\n",
"# they seem confused between 2 different models\n",
"# 1981 renault 18i\n",
"df.at[354,'horsepower'] = '81.5'\n",
"#1982 AMC concord dl 151\n",
"df.at[374,'horsepower'] = '90'"
]
},
{
"cell_type": "markdown",
"id": "68d959c5-9628-437f-8f3f-0b4c7002b1f0",
"metadata": {},
"source": [
"We'll leave the Mustang unfilled because its entry is too far from realistic; it looks like two different models got confused.\n",
"\n",
"Drop any rows whose horsepower is still '?'."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "10400330-e6aa-43e0-910f-f97869c23d0f",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.280095Z",
"iopub.status.busy": "2022-07-21T20:29:37.279777Z",
"iopub.status.idle": "2022-07-21T20:29:37.286985Z",
"shell.execute_reply": "2022-07-21T20:29:37.286202Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.280060Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Discard rows whose horsepower is still the '?' placeholder, cast the column\n",
"# to float, and renumber the rows from 0.\n",
"df = (\n",
"    df[df.horsepower != '?']\n",
"    .astype({'horsepower': float})\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b2afc76d-c428-4b81-9882-5ea19ecd04bb",
"metadata": {},
"source": [
"Horsepower has also been converted to float, like the rest; confirm below."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e0fd9a7b-6cdf-4346-8c8d-6c5f36e167f6",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.289881Z",
"iopub.status.busy": "2022-07-21T20:29:37.289472Z",
"iopub.status.idle": "2022-07-21T20:29:37.301335Z",
"shell.execute_reply": "2022-07-21T20:29:37.300537Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.289852Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 397 entries, 0 to 396\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 mpg 397 non-null float64\n",
" 1 cylinders 397 non-null int64 \n",
" 2 displacement 397 non-null float64\n",
" 3 horsepower 397 non-null float64\n",
" 4 weight 397 non-null float64\n",
" 5 acceleration 397 non-null float64\n",
" 6 model_year 397 non-null int64 \n",
" 7 origin 397 non-null int64 \n",
" 8 car_name 397 non-null object \n",
"dtypes: float64(5), int64(3), object(1)\n",
"memory usage: 28.0+ KB\n"
]
}
],
"source": [
"# Re-check the dtypes and non-null counts after the horsepower cleanup\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"id": "097c4cec-eb77-46f6-8eef-89eb7c47b425",
"metadata": {},
"source": [
"Looks good"
]
},
{
"cell_type": "markdown",
"id": "151e5f1b-6409-4972-9c79-a26d132eedf5",
"metadata": {},
"source": [
"### Min/Max to check range"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "769f33e7-2f2e-46e8-b6dd-8f8fb79d13b7",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.303380Z",
"iopub.status.busy": "2022-07-21T20:29:37.302680Z",
"iopub.status.idle": "2022-07-21T20:29:37.310508Z",
"shell.execute_reply": "2022-07-21T20:29:37.309738Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.303336Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mpg\n",
"Min: 9.0 \n",
"Max: 46.6\n",
"\n",
"cylinders\n",
"Min: 3 \n",
"Max: 8\n",
"\n",
"displacement\n",
"Min: 68.0 \n",
"Max: 455.0\n",
"\n",
"horsepower\n",
"Min: 46.0 \n",
"Max: 230.0\n",
"\n",
"weight\n",
"Min: 1613.0 \n",
"Max: 5140.0\n",
"\n",
"acceleration\n",
"Min: 8.0 \n",
"Max: 24.8\n",
"\n",
"model_year\n",
"Min: 70 \n",
"Max: 82\n",
"\n",
"origin\n",
"Min: 1 \n",
"Max: 3\n",
"\n"
]
}
],
"source": [
"# Print the smallest and largest value of every column except the last one (car_name).\n",
"for name in df.columns[:-1]:\n",
"    lo, hi = df[name].min(), df[name].max()\n",
"    print(f'{name}\\nMin: {lo} \\nMax: {hi}\\n')"
]
},
{
"cell_type": "markdown",
"id": "59641984-e266-4eaa-a90d-af266cb95936",
"metadata": {},
"source": [
"All of this makes sense"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7bac1a71-53d2-4081-b566-244bccd3a3c6",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.312020Z",
"iopub.status.busy": "2022-07-21T20:29:37.311631Z",
"iopub.status.idle": "2022-07-21T20:29:37.319881Z",
"shell.execute_reply": "2022-07-21T20:29:37.318953Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.311992Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"array(['datsun pl510', 'amc gremlin', 'chevrolet chevelle malibu',\n",
" 'chevrolet impala', 'ford galaxie 500', 'plymouth fury iii',\n",
" 'pontiac catalina', 'amc matador', 'amc hornet', 'ford maverick',\n",
" 'plymouth duster', 'chevrolet vega', 'ford pinto',\n",
" 'toyota corolla 1200', 'ford gran torino', 'ford gran torino (sw)',\n",
" 'amc matador (sw)', 'opel manta', 'toyota corona', 'fiat 128',\n",
" 'chevrolet nova', 'ford ltd', 'volkswagen dasher', 'datsun 710',\n",
" 'audi 100ls', 'peugeot 504', 'saab 99le', 'opel 1900',\n",
" 'dodge colt', 'chevrolet chevelle malibu classic',\n",
" 'plymouth valiant', 'honda civic', 'volkswagen rabbit',\n",
" 'toyota corolla', 'toyota mark ii', 'chevrolet caprice classic',\n",
" 'chevrolet chevette', 'honda civic cvcc', 'chevrolet malibu',\n",
" 'chevrolet monte carlo landau', 'buick estate wagon (sw)',\n",
" 'ford country squire (sw)', 'oldsmobile cutlass salon brougham',\n",
" 'vw rabbit', 'chevrolet citation', 'amc concord', 'dodge aspen',\n",
" 'datsun 210', 'subaru dl', 'buick skylark', 'plymouth reliant',\n",
" 'subaru', 'mazda 626', 'buick century', 'pontiac phoenix',\n",
" 'honda accord'], dtype=object)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Car names that occur more than once (each listed a single time)\n",
"df.loc[df.car_name.duplicated(), 'car_name'].unique()"
]
},
{
"cell_type": "markdown",
"id": "81b8d5a5-d323-4a70-b951-b2fe4fb1e35f",
"metadata": {},
"source": [
"There are some duplicate car names; honestly, I wish there were more. If I had a lot of data with many duplicate car names, I imagine it would actually be easier to predict MPG. I'll say more on this later, but there are some big factors that aren't represented here."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "87715776-3634-4ca7-bbb4-e04633fe4791",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.321678Z",
"iopub.status.busy": "2022-07-21T20:29:37.321045Z",
"iopub.status.idle": "2022-07-21T20:29:37.354573Z",
"shell.execute_reply": "2022-07-21T20:29:37.353866Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.321651Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mpg</th>\n",
" <th>cylinders</th>\n",
" <th>displacement</th>\n",
" <th>horsepower</th>\n",
" <th>weight</th>\n",
" <th>acceleration</th>\n",
" <th>model_year</th>\n",
" <th>origin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" <td>397.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>23.514358</td>\n",
" <td>5.458438</td>\n",
" <td>193.560453</td>\n",
" <td>104.123426</td>\n",
" <td>2970.589421</td>\n",
" <td>15.571285</td>\n",
" <td>76.000000</td>\n",
" <td>1.574307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>7.825846</td>\n",
" <td>1.701577</td>\n",
" <td>104.366796</td>\n",
" <td>38.396800</td>\n",
" <td>847.903955</td>\n",
" <td>2.760431</td>\n",
" <td>3.696846</td>\n",
" <td>0.802549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>9.000000</td>\n",
" <td>3.000000</td>\n",
" <td>68.000000</td>\n",
" <td>46.000000</td>\n",
" <td>1613.000000</td>\n",
" <td>8.000000</td>\n",
" <td>70.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>17.500000</td>\n",
" <td>4.000000</td>\n",
" <td>104.000000</td>\n",
" <td>75.000000</td>\n",
" <td>2223.000000</td>\n",
" <td>13.800000</td>\n",
" <td>73.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>23.000000</td>\n",
" <td>4.000000</td>\n",
" <td>151.000000</td>\n",
" <td>92.000000</td>\n",
" <td>2800.000000</td>\n",
" <td>15.500000</td>\n",
" <td>76.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>29.000000</td>\n",
" <td>8.000000</td>\n",
" <td>262.000000</td>\n",
" <td>125.000000</td>\n",
" <td>3609.000000</td>\n",
" <td>17.200000</td>\n",
" <td>79.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>46.600000</td>\n",
" <td>8.000000</td>\n",
" <td>455.000000</td>\n",
" <td>230.000000</td>\n",
" <td>5140.000000</td>\n",
" <td>24.800000</td>\n",
" <td>82.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mpg cylinders displacement horsepower weight \\\n",
"count 397.000000 397.000000 397.000000 397.000000 397.000000 \n",
"mean 23.514358 5.458438 193.560453 104.123426 2970.589421 \n",
"std 7.825846 1.701577 104.366796 38.396800 847.903955 \n",
"min 9.000000 3.000000 68.000000 46.000000 1613.000000 \n",
"25% 17.500000 4.000000 104.000000 75.000000 2223.000000 \n",
"50% 23.000000 4.000000 151.000000 92.000000 2800.000000 \n",
"75% 29.000000 8.000000 262.000000 125.000000 3609.000000 \n",
"max 46.600000 8.000000 455.000000 230.000000 5140.000000 \n",
"\n",
" acceleration model_year origin \n",
"count 397.000000 397.000000 397.000000 \n",
"mean 15.571285 76.000000 1.574307 \n",
"std 2.760431 3.696846 0.802549 \n",
"min 8.000000 70.000000 1.000000 \n",
"25% 13.800000 73.000000 1.000000 \n",
"50% 15.500000 76.000000 1.000000 \n",
"75% 17.200000 79.000000 2.000000 \n",
"max 24.800000 82.000000 3.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"id": "90fe9344-fe47-4503-be59-74d9d38cf1d3",
"metadata": {},
"source": [
"Everything looks proportional"
]
},
{
"cell_type": "markdown",
"id": "042416c1-0e56-4269-96c8-6926392e11e7",
"metadata": {},
"source": [
"### Save"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b3b42cca-6960-4d06-b7c4-1570f09e9fe0",
"metadata": {
"execution": {
"iopub.execute_input": "2022-07-21T20:29:37.355994Z",
"iopub.status.busy": "2022-07-21T20:29:37.355617Z",
"iopub.status.idle": "2022-07-21T20:29:37.364909Z",
"shell.execute_reply": "2022-07-21T20:29:37.364122Z",
"shell.execute_reply.started": "2022-07-21T20:29:37.355966Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Write the cleaned frame to disk; index=False keeps the row index out of the CSV\n",
"df.to_csv('data/clean.csv', index=False)"
]
},
{
"cell_type": "markdown",
"id": "59524851-efe5-4042-8eee-d67038a13a77",
"metadata": {},
"source": [
"[EDA](eda.ipynb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}