연습1

In [260]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [244]:

df=pd.read_csv("../data/auto-mpg.csv")

In [245]:

df.head(2)

Out[245]:

	mpg	cyl	displ	hp	weight	accel	yr	origin	name
0	18.0	8	307.0	130	3504	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165	3693	11.5	70	1	buick skylark 320

In [246]:

df.describe()

Out[246]:

	mpg	cyl	displ	hp	weight	accel	yr	origin
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000

In [247]:

df.isna().sum()

Out[247]:

mpg       0
cyl       0
displ     0
hp        0
weight    0
accel     0
yr        0
origin    0
name      0
dtype: int64

In [248]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     392 non-null    float64
 1   cyl     392 non-null    int64  
 2   displ   392 non-null    float64
 3   hp      392 non-null    int64  
 4   weight  392 non-null    int64  
 5   accel   392 non-null    float64
 6   yr      392 non-null    int64  
 7   origin  392 non-null    int64  
 8   name    392 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 27.7+ KB

In [249]:

y = df["mpg"]

In [250]:

Out[250]:

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
387    27.0
388    44.0
389    32.0
390    28.0
391    31.0
Name: mpg, Length: 392, dtype: float64

In [251]:

X = df.iloc[:,1:-2+1 ]

In [ ]:

In [252]:

Out[252]:

	cyl	displ	hp	weight	accel	yr	origin
0	8	307.0	130	3504	12.0	70	1
1	8	350.0	165	3693	11.5	70	1
2	8	318.0	150	3436	11.0	70	1
3	8	304.0	150	3433	12.0	70	1
4	8	302.0	140	3449	10.5	70	1
...	...	...	...	...	...	...	...
387	4	140.0	86	2790	15.6	82	1
388	4	97.0	52	2130	24.6	82	2
389	4	135.0	84	2295	11.6	82	1
390	4	120.0	79	2625	18.6	82	1
391	4	119.0	82	2720	19.4	82	1

392 rows × 7 columns

In [253]:

X["cyl"].unique()

Out[253]:

array([8, 4, 6, 3, 5], dtype=int64)

In [254]:

X["yr"].unique()

Out[254]:

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], dtype=int64)

In [255]:

ct = ColumnTransformer( [ ("encoder", OneHotEncoder(), [6]) ], remainder = 'passthrough')

In [256]:

X = ct.fit_transform( X)

In [257]:

Out[257]:

array([[1.000e+00, 0.000e+00, 0.000e+00, ..., 3.504e+03, 1.200e+01,
        7.000e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 3.693e+03, 1.150e+01,
        7.000e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 3.436e+03, 1.100e+01,
        7.000e+01],
       ...,
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 2.295e+03, 1.160e+01,
        8.200e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 2.625e+03, 1.860e+01,
        8.200e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 2.720e+03, 1.940e+01,
        8.200e+01]])

In [258]:

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=1)

In [261]:

regressor = LinearRegression()

In [262]:

regressor.fit(X_train,y_train)

Out[262]:

LinearRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [263]:

y_pred = regressor.predict(X_test)

In [264]:

y_pred

Out[264]:

array([23.94462531, 28.64761934, 32.33077781, 21.99464755, 35.28265957,
       28.12319115, 26.45524167, 12.58160302, 25.56306456, 32.04254865,
       23.32597109, 32.29214001, 20.14095012, 31.77638576, 29.46904954,
       15.04190357, 30.0344167 , 32.62473165, 10.92503097, 22.93824236,
       27.20037833, 17.75867846, 18.10527569, 16.16985336, 10.79919676,
       20.03178998, 31.66807007, 18.42520819, 21.73299914, 29.1359896 ,
       21.92214974, 24.36825885, 10.33853798, 28.57027834, 20.81243752,
       14.53200154, 16.94944932, 21.21237735, 35.27195558, 24.73713275,
       16.12598196, 11.89879942, 22.60011043, 22.26645573, 22.1334667 ,
       17.99528974,  8.68452228, 35.73286016, 21.07296509, 10.51727695,
       15.10201083, 23.20486955, 27.21539361, 30.31799228, 34.51063284,
       20.79063836, 26.05083995, 20.90202994, 26.71031312, 33.73741146,
       28.52415583, 27.14325924, 35.16257625, 22.03416555, 22.81474093,
       20.99775758, 22.69550132, 14.0908763 , 32.84090607,  6.40461693,
       30.72823425, 20.98887219, 18.35621099, 32.14480289, 23.79497128,
       30.57166865, 20.49454935, 12.24164985, 11.67133326])

In [265]:

y_test

Out[265]:

81     23.0
165    29.0
351    32.4
119    19.0
379    38.0
       ... 
23     26.0
293    35.7
13     14.0
90     13.0
61     13.0
Name: mpg, Length: 79, dtype: float64

In [267]:

error = y_test-y_pred

In [268]:

error

Out[268]:

81    -0.944625
165    0.352381
351    0.069222
119   -2.994648
379    2.717340
         ...   
23     2.205029
293    5.128331
13    -6.494549
90     0.758350
61     1.328667
Name: mpg, Length: 79, dtype: float64

In [270]:

(error ** 2).mean()

Out[270]:

11.882268558597138

In [ ]:

In [271]:

# 새로운 자동차가 나왔습니다.
# displ 301, hp 133, cyl 6, weight 2900, accel 10.7, yr 81, origin 2(유럽차)
# 이차의 연비예측?

In [272]:

df.head(1)

Out[272]:

	mpg	cyl	displ	hp	weight	accel	yr	origin	name
0	18.0	8	307.0	130	3504	12.0	70	1	chevrolet chevelle malibu

In [286]:

new_data = np.array([6,301,133,2900,10.7,81,2])

In [287]:

new_data = new_data.reshape(1,7)

In [288]:

new_data

Out[288]:

array([[6.00e+00, 3.01e+02, 1.33e+02, 2.90e+03, 1.07e+01, 8.10e+01,
        2.00e+00]])

In [289]:

new_data=ct.transform(new_data)

C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but OneHotEncoder was fitted with feature names
  warnings.warn(

In [290]:

regressor.predict(new_data)

Out[290]:

array([30.51076092])

In [ ]:

# 인공지능과 ct 를 파일로 저장하기

In [291]:

import joblib

In [292]:

joblib.dump(regressor, 'regressor.pkl')

Out[292]:

['regressor.pkl']

In [293]:

joblib.dump(ct, 'ct.pkl')

Out[293]:

['ct.pkl']

'DataScience > MachineLearning' 카테고리의 다른 글

Machine Logistic Regression 데이터의 결점보완(0,nan), 데이터의 불균형 up sampling 기법, 결과를 히트맵으로 표현 (0)	2022.12.02
Machine [supervised{Classification(Logisticregression)}] (0)	2022.12.02
Machine Multiple Linear Regression (0)	2022.12.01
Machine [supervised{Prediction(Linear Regression)}] (0)	2022.12.01
Machine preprocessing, Feature Scaling, Dataset Training & Test (0)	2022.12.01