In [ ]:
# 50 Startups csv 파일을 가지고, 딥러닝 이용해서 학습하고, 평가까지 해보세요.
In [1]:
import numpy as np
import pandas as pd
In [2]:
import os
In [ ]:
In [3]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [4]:
pwd
Out[4]:
'/content'
In [5]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/ml_plus')
In [7]:
pwd
Out[7]:
'/content/drive/MyDrive/Colab Notebooks/ml_plus'
In [8]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
In [10]:
df = pd.read_csv('data/50_Startups.csv')
In [11]:
df.head(3)
Out[11]:
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
In [ ]:
In [12]:
df.isna().sum()
Out[12]:
R&D Spend 0 Administration 0 Marketing Spend 0 State 0 Profit 0 dtype: int64
In [13]:
df.describe()
Out[13]:
R&D Spend | Administration | Marketing Spend | Profit | |
---|---|---|---|---|
count | 50.000000 | 50.000000 | 50.000000 | 50.000000 |
mean | 73721.615600 | 121344.639600 | 211025.097800 | 112012.639200 |
std | 45902.256482 | 28017.802755 | 122290.310726 | 40306.180338 |
min | 0.000000 | 51283.140000 | 0.000000 | 14681.400000 |
25% | 39936.370000 | 103730.875000 | 129300.132500 | 90138.902500 |
50% | 73051.080000 | 122699.795000 | 212716.240000 | 107978.190000 |
75% | 101602.800000 | 144842.180000 | 299469.085000 | 139765.977500 |
max | 165349.200000 | 182645.560000 | 471784.100000 | 192261.830000 |
In [64]:
import matplotlib.pyplot as plt
In [118]:
# 상관관계를 그려본다.
import seaborn as sb
sb.pairplot(data = df)
plt.show()
In [ ]:
# 수치 예측의 문제 리니어 리그레션
In [14]:
df['State'].nunique()
Out[14]:
3
In [15]:
df['State'].unique()
Out[15]:
array(['New York', 'California', 'Florida'], dtype=object)
In [ ]:
# X와 y로 나눈다
In [40]:
df.head(1)
Out[40]:
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
0 | 165349.2 | 136897.8 | 471784.1 | New York | 192261.83 |
In [41]:
df.columns
Out[41]:
Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')
In [43]:
X = df.loc[:,['R&D Spend', 'Administration', 'Marketing Spend', 'State' ] ]
In [44]:
y = df['Profit']
In [ ]:
# 인코딩 진행
In [16]:
from sklearn.preprocessing import OneHotEncoder
In [36]:
from sklearn.compose import ColumnTransformer
In [37]:
# One-hot encode column index 3 ('State'); remainder='passthrough' keeps the
# three numeric spend columns unchanged (they end up AFTER the encoded columns).
ct = ColumnTransformer( [ ('encoder', OneHotEncoder(), [3] ) ],
remainder='passthrough')
In [46]:
X = ct.fit_transform(X.values)
In [ ]:
# The first one-hot COLUMN (not row) can be dropped — the remaining two columns
# still uniquely identify the state (avoids the dummy-variable trap). Deleted
# below with numpy slicing.
In [56]:
X = X[ : , 1 : ]
In [ ]:
In [ ]:
# 표준화 진행
In [47]:
from sklearn.preprocessing import MinMaxScaler
In [50]:
scaler_X= MinMaxScaler()
In [51]:
scaler_y= MinMaxScaler()
In [57]:
X = scaler_X.fit_transform(X)
In [58]:
X
Out[58]:
array([[0. , 1. , 1. , 0.65174393, 1. ], [0. , 0. , 0.98335946, 0.76197173, 0.94089337], [1. , 0. , 0.92798459, 0.37957895, 0.8646636 ], [0. , 1. , 0.87313643, 0.51299839, 0.81223513], [1. , 0. , 0.85943772, 0.30532804, 0.77613557], [0. , 1. , 0.797566 , 0.3694479 , 0.76912588], [0. , 0. , 0.81412828, 0.73016111, 0.27071031], [1. , 0. , 0.7880179 , 0.71745725, 0.68649342], [0. , 1. , 0.72901786, 0.74173276, 0.66049977], [0. , 0. , 0.74590551, 0.43692884, 0.64644319], [1. , 0. , 0.61635061, 0.45150637, 0.48573267], [0. , 0. , 0.60884455, 0.30836422, 0.52936195], [1. , 0. , 0.56766982, 0.57883556, 0.52956308], [0. , 0. , 0.55635219, 0.64106561, 0.53555202], [1. , 0. , 0.72539353, 0.8013272 , 0.54370828], [0. , 1. , 0.69261666, 0.54302973, 0.55486446], [0. , 0. , 0.47180821, 0.53527036, 0.56031151], [0. , 1. , 0.57246821, 0.71401273, 0.59894835], [1. , 0. , 0.55488118, 0.47877201, 0.62511553], [0. , 1. , 0.52264964, 0.77823604, 0. ], [0. , 0. , 0.46116861, 0.47642362, 0.63305328], [0. , 1. , 0.47408436, 0.78021012, 0.63532724], [1. , 0. , 0.4475048 , 0.54429273, 0.64291963], [1. , 0. , 0.40842369, 0.4146383 , 0.64599195], [0. , 1. , 0.46594728, 0.3653876 , 0.29796428], [0. , 0. , 0.39107967, 0.67195793, 0.29242745], [1. , 0. , 0.45557444, 0.70684477, 0.28413435], [0. , 1. , 0.43609283, 0.58297807, 0.74861321], [1. , 0. , 0.39946683, 1. , 0.25042853], [0. , 1. , 0.39676926, 0.77456642, 0.22709197], [1. , 0. , 0.37493063, 0.48992809, 0.19316302], [0. , 1. , 0.36974101, 0.77205322, 0.18698856], [0. , 0. , 0.38348453, 0.5932935 , 0.09768292], [1. , 0. , 0.33561668, 0.39413365, 0.45494286], [0. , 0. , 0.2807759 , 0.81005496, 0.44680961], [0. , 1. , 0.2782839 , 0.25703165, 0.43561799], [1. , 0. , 0.17335288, 0.57682456, 0.42631115], [0. , 0. , 0.26652654, 0. , 0.41762624], [0. , 1. , 0.12234465, 0.11163611, 0.39269043], [0. , 0. , 0.23319442, 0.24130912, 0.3709309 ], [0. , 0. , 0.17390063, 0.51204073, 0.36626005], [1. , 0. , 0.16869099, 0.25446874, 0.34861436], [0. , 0. 
, 0.14297577, 0.34185188, 0.31370517], [0. , 1. , 0.09377566, 0.57930693, 0.07531871], [0. , 0. , 0.13412668, 0.78807166, 0.06005866], [0. , 1. , 0.0060492 , 0.5547241 , 0.0040356 ], [1. , 0. , 0.00795565, 0.49125975, 0.62976785], [0. , 0. , 0. , 0.64054682, 0. ], [0. , 1. , 0.00327821, 0.00350184, 0. ], [0. , 0. , 0. , 0.50014806, 0.09574943]])
In [65]:
y.values.reshape(50,1)
Out[65]:
array([[192261.83], [191792.06], [191050.39], [182901.99], [166187.94], [156991.12], [156122.51], [155752.6 ], [152211.77], [149759.96], [146121.95], [144259.4 ], [141585.52], [134307.35], [132602.65], [129917.04], [126992.93], [125370.37], [124266.9 ], [122776.86], [118474.03], [111313.02], [110352.25], [108733.99], [108552.04], [107404.34], [105733.54], [105008.31], [103282.38], [101004.64], [ 99937.59], [ 97483.56], [ 97427.84], [ 96778.92], [ 96712.8 ], [ 96479.51], [ 90708.19], [ 89949.14], [ 81229.06], [ 81005.76], [ 78239.91], [ 77798.83], [ 71498.49], [ 69758.98], [ 65200.33], [ 64926.08], [ 49490.75], [ 42559.73], [ 35673.41], [ 14681.4 ]])
In [66]:
y = scaler_y.fit_transform(y.values.reshape(50,1))
In [67]:
y
Out[67]:
array([[1. ], [0.99735461], [0.99317808], [0.94729239], [0.85317138], [0.80138177], [0.79649041], [0.79440736], [0.77446805], [0.7606613 ], [0.74017475], [0.72968626], [0.71462897], [0.67364377], [0.66404417], [0.64892083], [0.63245443], [0.62331739], [0.61710347], [0.60871268], [0.58448237], [0.54415692], [0.53874658], [0.52963376], [0.52860915], [0.52214616], [0.51273747], [0.50865352], [0.49893437], [0.48610784], [0.48009902], [0.46627976], [0.46596599], [0.46231175], [0.46193942], [0.4606257 ], [0.42812595], [0.42385155], [0.37474659], [0.37348913], [0.35791393], [0.3554301 ], [0.3199513 ], [0.31015569], [0.28448478], [0.28294041], [0.19602019], [0.15698988], [0.11821128], [0. ]])
In [68]:
from sklearn.model_selection import train_test_split
In [71]:
X_train.shape
Out[71]:
(40, 5)
In [69]:
X_train,X_test,y_train, y_test = train_test_split(X,y_scale, test_size=0.2,random_state=50)
In [ ]:
# ANN 모델링
In [79]:
def build_model():
    """Build and compile a small ANN for regression on the startup data.

    Input: 5 features (2 remaining one-hot State columns + 3 MinMax-scaled
    spend columns). Output: 1 linear unit predicting the scaled Profit.
    """
    # NOTE: removed `from keras.engine import input_layer` — it was unused and
    # `keras.engine` no longer exists in modern Keras, breaking a fresh re-run.
    model = Sequential()
    model.add(Dense(units=10, activation='relu', input_shape=(5,)))
    model.add(Dense(units=10, activation='relu'))
    # Linear activation: this is a numeric-prediction (regression) problem.
    model.add(Dense(units=1, activation='linear'))
    # MSE loss is the standard choice for regression.
    model.compile('adam', 'mean_squared_error')
    return model
In [87]:
model = build_model()
In [81]:
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_6 (Dense) (None, 10) 60 dense_7 (Dense) (None, 10) 110 dense_8 (Dense) (None, 1) 11 ================================================================= Total params: 181 Trainable params: 181 Non-trainable params: 0 _________________________________________________________________
In [89]:
epoch_history = model.fit(X_train, y_train, batch_size = 10, epochs= 30)
Epoch 1/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0370 Epoch 2/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0344 Epoch 3/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0320 Epoch 4/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0299 Epoch 5/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0277 Epoch 6/30 4/4 [==============================] - 0s 6ms/step - loss: 0.0258 Epoch 7/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0240 Epoch 8/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0221 Epoch 9/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0206 Epoch 10/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0192 Epoch 11/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0178 Epoch 12/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0168 Epoch 13/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0159 Epoch 14/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0150 Epoch 15/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0140 Epoch 16/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0134 Epoch 17/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0126 Epoch 18/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0121 Epoch 19/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0116 Epoch 20/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0112 Epoch 21/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0106 Epoch 22/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0103 Epoch 23/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0099 Epoch 24/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0096 Epoch 25/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0094 Epoch 26/30 4/4 [==============================] - 0s 
5ms/step - loss: 0.0091 Epoch 27/30 4/4 [==============================] - 0s 5ms/step - loss: 0.0089 Epoch 28/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0086 Epoch 29/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0084 Epoch 30/30 4/4 [==============================] - 0s 4ms/step - loss: 0.0082
In [ ]:
In [90]:
# 로스를 그래프로 확인하기위해 epoch_history로 저장을했다.
import matplotlib.pyplot as plt
In [91]:
epoch_history.history # 딕셔너리형태
Out[91]:
{'loss': [0.037024714052677155, 0.03440560773015022, 0.032001279294490814, 0.029904764145612717, 0.02772681973874569, 0.025789299979805946, 0.023978762328624725, 0.022145789116621017, 0.020642820745706558, 0.019176553934812546, 0.01783348061144352, 0.016819536685943604, 0.015867438167333603, 0.014964175410568714, 0.014031457714736462, 0.013441607356071472, 0.012639869935810566, 0.012070409022271633, 0.011620474979281425, 0.0111702810972929, 0.010649261996150017, 0.010288791730999947, 0.009946207515895367, 0.00959742534905672, 0.009423546493053436, 0.00909239612519741, 0.008856101892888546, 0.00862288661301136, 0.008418883197009563, 0.008232207968831062]}
In [93]:
epoch_history.history['loss'] # loss를 키로 값만확인
Out[93]:
[0.037024714052677155, 0.03440560773015022, 0.032001279294490814, 0.029904764145612717, 0.02772681973874569, 0.025789299979805946, 0.023978762328624725, 0.022145789116621017, 0.020642820745706558, 0.019176553934812546, 0.01783348061144352, 0.016819536685943604, 0.015867438167333603, 0.014964175410568714, 0.014031457714736462, 0.013441607356071472, 0.012639869935810566, 0.012070409022271633, 0.011620474979281425, 0.0111702810972929, 0.010649261996150017, 0.010288791730999947, 0.009946207515895367, 0.00959742534905672, 0.009423546493053436, 0.00909239612519741, 0.008856101892888546, 0.00862288661301136, 0.008418883197009563, 0.008232207968831062]
In [99]:
# Visualize the loss curve to judge whether the error is still decreasing
# (i.e. whether more epochs would help).
loss_history = epoch_history.history['loss']
# Derive the x-axis from the history itself instead of hard-coding 30 epochs.
plt.plot(np.arange(1, len(loss_history) + 1), loss_history)
plt.xlabel('# epochs')
plt.ylabel('Loss')
plt.show()
In [ ]:
# 검증은 필수.
In [102]:
model.evaluate(X_test,y_test)
1/1 [==============================] - 1s 773ms/step - loss: 0.0026
Out[102]:
0.00256070913746953
In [ ]:
# 새로운 신규 회사가 있는데, 플로리다에 있고, 운영비는 2십만 달라,
# 연구개발비는 25만달라, 마케팅비는 38만달러를 쓰고 있다.
# 이회사의 수익을 예측하시오.
In [104]:
df.head(1)
Out[104]:
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
0 | 165349.2 | 136897.8 | 471784.1 | New York | 192261.83 |
In [106]:
X[0]
Out[106]:
array([0. , 1. , 1. , 0.65174393, 1. ])
In [107]:
new_data = np.array([250000, 200000, 380000, 'Florida'])
In [108]:
# Transformers expect a 2-D array: one sample row; -1 infers the feature count
# (the original hard-coded 4).
new_data = new_data.reshape(1, -1)
In [110]:
new_data = ct.transform(new_data)
In [111]:
new_data
Out[111]:
array([['0.0', '1.0', '0.0', '250000', '200000', '380000']], dtype='<U32')
In [112]:
new_data = new_data[ : , 1 : ]
In [113]:
new_data
Out[113]:
array([['1.0', '0.0', '250000', '200000', '380000']], dtype='<U32')
In [114]:
new_data = scaler_X.transform(new_data)
In [115]:
y_pred = model.predict(new_data)
1/1 [==============================] - 0s 90ms/step
In [116]:
y_pred
Out[116]:
array([[1.2161369]], dtype=float32)
In [117]:
scaler_y.inverse_transform(y_pred)
Out[117]:
array([[230643.52]], dtype=float32)
In [ ]:
In [ ]:
'DataScience > TensorFlow[ANN]' 카테고리의 다른 글
딥러닝 텐서플로우 10개로 분류된 패션이미지 분류 예시, softmax, np.argmax(axis=1), overfitting, callbacks (0) | 2022.12.29 |
---|---|
딥러닝 텐서플로우 leaning rate를 옵티마이저에서 셋팅, 밸리데이션 데이터란 무엇이고 사용법,EarlyStopping 라이브러리 사용법 (0) | 2022.12.28 |
딥러닝 텐서플로우 리그레션(regression) 문제 모델링 하는 방법 (0) | 2022.12.28 |
딥러닝 텐서플로우 분류의문제 GridSearch (저번편에 빈공간에 이어서..) (0) | 2022.12.27 |
딥러닝 ANN개념 정리 요약 (0) | 2022.12.27 |