808 lines
25 KiB
Text
808 lines
25 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "9151a000-1923-408b-bd86-16008dc95f97",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"[readme](readme.md)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "cecbac86-abb3-4f6b-a101-2d9324d96274",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Cleaning"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "b67cb510-2df0-4ce4-a033-473710fdc749",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Load file and set column names"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"id": "3c4bfade-d06d-4887-9eb4-ec7f5bc61625",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:36.887038Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:36.886672Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.222976Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.222218Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:36.886962Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"\n",
|
||
|
"df = pd.read_csv('data/auto-mpg.data',header=None,delim_whitespace=True)\n",
|
||
|
"df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
|
||
|
" 'acceleration','model_year','origin','car_name']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "fdcec7e3-c65e-4d66-9a10-b500fb940234",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Attribute Information:\n",
|
||
|
"\n",
|
||
|
" 1. mpg: continuous\n",
|
||
|
" 2. cylinders: multi-valued discrete\n",
|
||
|
" 3. displacement: continuous\n",
|
||
|
" 4. horsepower: continuous\n",
|
||
|
" 5. weight: continuous\n",
|
||
|
" 6. acceleration: continuous\n",
|
||
|
" 7. model year: multi-valued discrete\n",
|
||
|
" 8. origin: multi-valued discrete\n",
|
||
|
" 9. car name: string (unique for each instance)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "62bbb6bd-b5b3-4d54-a132-23cd367c4570",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.225459Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.224901Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.237624Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.236773Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.225432Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
"RangeIndex: 398 entries, 0 to 397\n",
|
||
|
"Data columns (total 9 columns):\n",
|
||
|
" # Column Non-Null Count Dtype \n",
|
||
|
"--- ------ -------------- ----- \n",
|
||
|
" 0 mpg 398 non-null float64\n",
|
||
|
" 1 cylinders 398 non-null int64 \n",
|
||
|
" 2 displacement 398 non-null float64\n",
|
||
|
" 3 horsepower 398 non-null object \n",
|
||
|
" 4 weight 398 non-null float64\n",
|
||
|
" 5 acceleration 398 non-null float64\n",
|
||
|
" 6 model_year 398 non-null int64 \n",
|
||
|
" 7 origin 398 non-null int64 \n",
|
||
|
" 8 car_name 398 non-null object \n",
|
||
|
"dtypes: float64(4), int64(3), object(2)\n",
|
||
|
"memory usage: 28.1+ KB\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.info()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "6a4028ed-eda3-4c50-aed0-d9503d41a8e1",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Why is horsepower not a number?"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"id": "58fa2876-4ccb-4ef5-bc16-d25b74efb457",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.239126Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.238760Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.252035Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.251217Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.239098Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',\n",
|
||
|
" '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',\n",
|
||
|
" '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',\n",
|
||
|
" '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',\n",
|
||
|
" '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',\n",
|
||
|
" '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',\n",
|
||
|
" '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',\n",
|
||
|
" '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',\n",
|
||
|
" '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',\n",
|
||
|
" '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',\n",
|
||
|
" '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',\n",
|
||
|
" '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',\n",
|
||
|
" '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',\n",
|
||
|
" '116.0', '82.00'], dtype=object)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.horsepower.unique()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"id": "2d99ea58-ca51-4461-a127-c6b389b056a1",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.253416Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.253082Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.271785Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.271054Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.253389Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>mpg</th>\n",
|
||
|
" <th>cylinders</th>\n",
|
||
|
" <th>displacement</th>\n",
|
||
|
" <th>horsepower</th>\n",
|
||
|
" <th>weight</th>\n",
|
||
|
" <th>acceleration</th>\n",
|
||
|
" <th>model_year</th>\n",
|
||
|
" <th>origin</th>\n",
|
||
|
" <th>car_name</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>32</th>\n",
|
||
|
" <td>25.0</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>98.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>2046.0</td>\n",
|
||
|
" <td>19.0</td>\n",
|
||
|
" <td>71</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>ford pinto</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>126</th>\n",
|
||
|
" <td>21.0</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>200.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>2875.0</td>\n",
|
||
|
" <td>17.0</td>\n",
|
||
|
" <td>74</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>ford maverick</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>330</th>\n",
|
||
|
" <td>40.9</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>85.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>1835.0</td>\n",
|
||
|
" <td>17.3</td>\n",
|
||
|
" <td>80</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>renault lecar deluxe</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>336</th>\n",
|
||
|
" <td>23.6</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>140.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>2905.0</td>\n",
|
||
|
" <td>14.3</td>\n",
|
||
|
" <td>80</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>ford mustang cobra</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>354</th>\n",
|
||
|
" <td>34.5</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>100.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>2320.0</td>\n",
|
||
|
" <td>15.8</td>\n",
|
||
|
" <td>81</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>renault 18i</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>374</th>\n",
|
||
|
" <td>23.0</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>151.0</td>\n",
|
||
|
" <td>?</td>\n",
|
||
|
" <td>3035.0</td>\n",
|
||
|
" <td>20.5</td>\n",
|
||
|
" <td>82</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>amc concord dl</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" mpg cylinders displacement horsepower weight acceleration \\\n",
|
||
|
"32 25.0 4 98.0 ? 2046.0 19.0 \n",
|
||
|
"126 21.0 6 200.0 ? 2875.0 17.0 \n",
|
||
|
"330 40.9 4 85.0 ? 1835.0 17.3 \n",
|
||
|
"336 23.6 4 140.0 ? 2905.0 14.3 \n",
|
||
|
"354 34.5 4 100.0 ? 2320.0 15.8 \n",
|
||
|
"374 23.0 4 151.0 ? 3035.0 20.5 \n",
|
||
|
"\n",
|
||
|
" model_year origin car_name \n",
|
||
|
"32 71 1 ford pinto \n",
|
||
|
"126 74 1 ford maverick \n",
|
||
|
"330 80 2 renault lecar deluxe \n",
|
||
|
"336 80 1 ford mustang cobra \n",
|
||
|
"354 81 2 renault 18i \n",
|
||
|
"374 82 1 amc concord dl "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df[df.horsepower == '?']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "498d069d-b95e-43d6-bd3d-4b707fdd9635",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"I'll fill in what I can find online"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"id": "e53a2eaf-a8f9-4d7e-bf8b-07a125cf6f06",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.273324Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.272853Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.278574Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.277496Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.273297Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# 1971 pinto kent I4\n",
|
||
|
"df.at[32,'horsepower'] = '75.0'\n",
|
||
|
"# 1974 maverick 200 I6\n",
|
||
|
"df.at[126,'horsepower'] = '85.0'\n",
|
||
|
"# 1980 renault lecar deluxe 85ci I4\n",
|
||
|
"df.at[330,'horsepower'] = '53.5'\n",
|
||
|
"# 1980 ford mustang cobra\n",
|
||
|
"# they seem confused between 2 different models\n",
|
||
|
"# 1981 renault 18i\n",
|
||
|
"df.at[354,'horsepower'] = '81.5'\n",
|
||
|
"#1982 AMC concord dl 151\n",
|
||
|
"df.at[374,'horsepower'] = '90'"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "68d959c5-9628-437f-8f3f-0b4c7002b1f0",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"We'll ignore the mustang because it's too far off from realistic, it looks like they got confused between two different models.\n",
|
||
|
"\n",
|
||
|
"Anyway, drop all '?' horsepower"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"id": "10400330-e6aa-43e0-910f-f97869c23d0f",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.280095Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.279777Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.286985Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.286202Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.280060Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df.drop(df[df.horsepower == '?'].index,inplace=True)\n",
|
||
|
"df['horsepower'] = df.horsepower.astype(float)\n",
|
||
|
"df.reset_index(inplace=True,drop=True)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "b2afc76d-c428-4b81-9882-5ea19ecd04bb",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"And set to floats, like the rest"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"id": "e0fd9a7b-6cdf-4346-8c8d-6c5f36e167f6",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.289881Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.289472Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.301335Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.300537Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.289852Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
"RangeIndex: 397 entries, 0 to 396\n",
|
||
|
"Data columns (total 9 columns):\n",
|
||
|
" # Column Non-Null Count Dtype \n",
|
||
|
"--- ------ -------------- ----- \n",
|
||
|
" 0 mpg 397 non-null float64\n",
|
||
|
" 1 cylinders 397 non-null int64 \n",
|
||
|
" 2 displacement 397 non-null float64\n",
|
||
|
" 3 horsepower 397 non-null float64\n",
|
||
|
" 4 weight 397 non-null float64\n",
|
||
|
" 5 acceleration 397 non-null float64\n",
|
||
|
" 6 model_year 397 non-null int64 \n",
|
||
|
" 7 origin 397 non-null int64 \n",
|
||
|
" 8 car_name 397 non-null object \n",
|
||
|
"dtypes: float64(5), int64(3), object(1)\n",
|
||
|
"memory usage: 28.0+ KB\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.info()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "097c4cec-eb77-46f6-8eef-89eb7c47b425",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Looks good"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "151e5f1b-6409-4972-9c79-a26d132eedf5",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Min/Max to check range"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"id": "769f33e7-2f2e-46e8-b6dd-8f8fb79d13b7",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.303380Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.302680Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.310508Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.309738Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.303336Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"mpg\n",
|
||
|
"Min: 9.0 \n",
|
||
|
"Max: 46.6\n",
|
||
|
"\n",
|
||
|
"cylinders\n",
|
||
|
"Min: 3 \n",
|
||
|
"Max: 8\n",
|
||
|
"\n",
|
||
|
"displacement\n",
|
||
|
"Min: 68.0 \n",
|
||
|
"Max: 455.0\n",
|
||
|
"\n",
|
||
|
"horsepower\n",
|
||
|
"Min: 46.0 \n",
|
||
|
"Max: 230.0\n",
|
||
|
"\n",
|
||
|
"weight\n",
|
||
|
"Min: 1613.0 \n",
|
||
|
"Max: 5140.0\n",
|
||
|
"\n",
|
||
|
"acceleration\n",
|
||
|
"Min: 8.0 \n",
|
||
|
"Max: 24.8\n",
|
||
|
"\n",
|
||
|
"model_year\n",
|
||
|
"Min: 70 \n",
|
||
|
"Max: 82\n",
|
||
|
"\n",
|
||
|
"origin\n",
|
||
|
"Min: 1 \n",
|
||
|
"Max: 3\n",
|
||
|
"\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"for col in df.columns[:-1]:\n",
|
||
|
" print(f'''{col}\n",
|
||
|
"Min: {df[col].min()} \n",
|
||
|
"Max: {df[col].max()}\n",
|
||
|
"''')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "59641984-e266-4eaa-a90d-af266cb95936",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"All of this makes sense"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"id": "7bac1a71-53d2-4081-b566-244bccd3a3c6",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.312020Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.311631Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.319881Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.318953Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.311992Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array(['datsun pl510', 'amc gremlin', 'chevrolet chevelle malibu',\n",
|
||
|
" 'chevrolet impala', 'ford galaxie 500', 'plymouth fury iii',\n",
|
||
|
" 'pontiac catalina', 'amc matador', 'amc hornet', 'ford maverick',\n",
|
||
|
" 'plymouth duster', 'chevrolet vega', 'ford pinto',\n",
|
||
|
" 'toyota corolla 1200', 'ford gran torino', 'ford gran torino (sw)',\n",
|
||
|
" 'amc matador (sw)', 'opel manta', 'toyota corona', 'fiat 128',\n",
|
||
|
" 'chevrolet nova', 'ford ltd', 'volkswagen dasher', 'datsun 710',\n",
|
||
|
" 'audi 100ls', 'peugeot 504', 'saab 99le', 'opel 1900',\n",
|
||
|
" 'dodge colt', 'chevrolet chevelle malibu classic',\n",
|
||
|
" 'plymouth valiant', 'honda civic', 'volkswagen rabbit',\n",
|
||
|
" 'toyota corolla', 'toyota mark ii', 'chevrolet caprice classic',\n",
|
||
|
" 'chevrolet chevette', 'honda civic cvcc', 'chevrolet malibu',\n",
|
||
|
" 'chevrolet monte carlo landau', 'buick estate wagon (sw)',\n",
|
||
|
" 'ford country squire (sw)', 'oldsmobile cutlass salon brougham',\n",
|
||
|
" 'vw rabbit', 'chevrolet citation', 'amc concord', 'dodge aspen',\n",
|
||
|
" 'datsun 210', 'subaru dl', 'buick skylark', 'plymouth reliant',\n",
|
||
|
" 'subaru', 'mazda 626', 'buick century', 'pontiac phoenix',\n",
|
||
|
" 'honda accord'], dtype=object)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df[df.car_name.duplicated()].car_name.unique()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "81b8d5a5-d323-4a70-b951-b2fe4fb1e35f",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"There are some duplicate car names, honestly I wish there were more. If I had a bunch of data with lots of duplicate car names it'd actually be easier to predict MPG I imagine, I'll say more on this later but there are some big factors that aren't represented here."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"id": "87715776-3634-4ca7-bbb4-e04633fe4791",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.321678Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.321045Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.354573Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.353866Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.321651Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>mpg</th>\n",
|
||
|
" <th>cylinders</th>\n",
|
||
|
" <th>displacement</th>\n",
|
||
|
" <th>horsepower</th>\n",
|
||
|
" <th>weight</th>\n",
|
||
|
" <th>acceleration</th>\n",
|
||
|
" <th>model_year</th>\n",
|
||
|
" <th>origin</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" <td>397.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>23.514358</td>\n",
|
||
|
" <td>5.458438</td>\n",
|
||
|
" <td>193.560453</td>\n",
|
||
|
" <td>104.123426</td>\n",
|
||
|
" <td>2970.589421</td>\n",
|
||
|
" <td>15.571285</td>\n",
|
||
|
" <td>76.000000</td>\n",
|
||
|
" <td>1.574307</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>7.825846</td>\n",
|
||
|
" <td>1.701577</td>\n",
|
||
|
" <td>104.366796</td>\n",
|
||
|
" <td>38.396800</td>\n",
|
||
|
" <td>847.903955</td>\n",
|
||
|
" <td>2.760431</td>\n",
|
||
|
" <td>3.696846</td>\n",
|
||
|
" <td>0.802549</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>9.000000</td>\n",
|
||
|
" <td>3.000000</td>\n",
|
||
|
" <td>68.000000</td>\n",
|
||
|
" <td>46.000000</td>\n",
|
||
|
" <td>1613.000000</td>\n",
|
||
|
" <td>8.000000</td>\n",
|
||
|
" <td>70.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>17.500000</td>\n",
|
||
|
" <td>4.000000</td>\n",
|
||
|
" <td>104.000000</td>\n",
|
||
|
" <td>75.000000</td>\n",
|
||
|
" <td>2223.000000</td>\n",
|
||
|
" <td>13.800000</td>\n",
|
||
|
" <td>73.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>23.000000</td>\n",
|
||
|
" <td>4.000000</td>\n",
|
||
|
" <td>151.000000</td>\n",
|
||
|
" <td>92.000000</td>\n",
|
||
|
" <td>2800.000000</td>\n",
|
||
|
" <td>15.500000</td>\n",
|
||
|
" <td>76.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>29.000000</td>\n",
|
||
|
" <td>8.000000</td>\n",
|
||
|
" <td>262.000000</td>\n",
|
||
|
" <td>125.000000</td>\n",
|
||
|
" <td>3609.000000</td>\n",
|
||
|
" <td>17.200000</td>\n",
|
||
|
" <td>79.000000</td>\n",
|
||
|
" <td>2.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>46.600000</td>\n",
|
||
|
" <td>8.000000</td>\n",
|
||
|
" <td>455.000000</td>\n",
|
||
|
" <td>230.000000</td>\n",
|
||
|
" <td>5140.000000</td>\n",
|
||
|
" <td>24.800000</td>\n",
|
||
|
" <td>82.000000</td>\n",
|
||
|
" <td>3.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" mpg cylinders displacement horsepower weight \\\n",
|
||
|
"count 397.000000 397.000000 397.000000 397.000000 397.000000 \n",
|
||
|
"mean 23.514358 5.458438 193.560453 104.123426 2970.589421 \n",
|
||
|
"std 7.825846 1.701577 104.366796 38.396800 847.903955 \n",
|
||
|
"min 9.000000 3.000000 68.000000 46.000000 1613.000000 \n",
|
||
|
"25% 17.500000 4.000000 104.000000 75.000000 2223.000000 \n",
|
||
|
"50% 23.000000 4.000000 151.000000 92.000000 2800.000000 \n",
|
||
|
"75% 29.000000 8.000000 262.000000 125.000000 3609.000000 \n",
|
||
|
"max 46.600000 8.000000 455.000000 230.000000 5140.000000 \n",
|
||
|
"\n",
|
||
|
" acceleration model_year origin \n",
|
||
|
"count 397.000000 397.000000 397.000000 \n",
|
||
|
"mean 15.571285 76.000000 1.574307 \n",
|
||
|
"std 2.760431 3.696846 0.802549 \n",
|
||
|
"min 8.000000 70.000000 1.000000 \n",
|
||
|
"25% 13.800000 73.000000 1.000000 \n",
|
||
|
"50% 15.500000 76.000000 1.000000 \n",
|
||
|
"75% 17.200000 79.000000 2.000000 \n",
|
||
|
"max 24.800000 82.000000 3.000000 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.describe()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "90fe9344-fe47-4503-be59-74d9d38cf1d3",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Everything looks proportional"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "042416c1-0e56-4269-96c8-6926392e11e7",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Save"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"id": "b3b42cca-6960-4d06-b7c4-1570f09e9fe0",
|
||
|
"metadata": {
|
||
|
"execution": {
|
||
|
"iopub.execute_input": "2022-07-21T20:29:37.355994Z",
|
||
|
"iopub.status.busy": "2022-07-21T20:29:37.355617Z",
|
||
|
"iopub.status.idle": "2022-07-21T20:29:37.364909Z",
|
||
|
"shell.execute_reply": "2022-07-21T20:29:37.364122Z",
|
||
|
"shell.execute_reply.started": "2022-07-21T20:29:37.355966Z"
|
||
|
},
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df.to_csv('data/clean.csv', index=False)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "59524851-efe5-4042-8eee-d67038a13a77",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"[EDA](eda.ipynb)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.5"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|