DataScience/MachineLearning
Machine Learning Logistic Regression: 데이터의 결점 보완(0 → NaN), 데이터 불균형 해소를 위한 up sampling 기법, 결과를 히트맵으로 표현
leopard4
2022. 12. 2. 12:02

In [1]:
import pandas as pd
In [3]:
df = pd.read_csv("../data/pima-indians-diabetes.csv")
In [28]:
# Summary statistics look suspicious: the min row is 0 for Plas, Pres, skin, test and mass.
df.describe()
Out[28]:
Preg | Plas | Pres | skin | test | mass | pedi | age | class | |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [4]:
df
Out[4]:
Preg | Plas | Pres | skin | test | mass | pedi | age | class | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
In [5]:
df.isna().sum()
Out[5]:
Preg 0 Plas 0 Pres 0 skin 0 test 0 mass 0 pedi 0 age 0 class 0 dtype: int64
In [ ]:
# 데이터가 없을 때 값을 0으로 세팅하는 경우가 실제로 많이 있다.
# 그래서 0으로 세팅된 값을 NaN으로 바꿔준다.
In [30]:
import numpy as np
In [31]:
df.loc[ :, "Plas":"age"].replace(0, np.nan)
Out[31]:
Plas | Pres | skin | test | mass | pedi | age | |
---|---|---|---|---|---|---|---|
0 | 148.0 | 72.0 | 35.0 | NaN | 33.6 | 0.627 | 50 |
1 | 85.0 | 66.0 | 29.0 | NaN | 26.6 | 0.351 | 31 |
2 | 183.0 | 64.0 | NaN | NaN | 23.3 | 0.672 | 32 |
3 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21 |
4 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 2.288 | 33 |
... | ... | ... | ... | ... | ... | ... | ... |
763 | 101.0 | 76.0 | 48.0 | 180.0 | 32.9 | 0.171 | 63 |
764 | 122.0 | 70.0 | 27.0 | NaN | 36.8 | 0.340 | 27 |
765 | 121.0 | 72.0 | 23.0 | 112.0 | 26.2 | 0.245 | 30 |
766 | 126.0 | 60.0 | NaN | NaN | 30.1 | 0.349 | 47 |
767 | 93.0 | 70.0 | 31.0 | NaN | 30.4 | 0.315 | 23 |
768 rows × 7 columns
In [32]:
# Write the zero-to-NaN replacement back into df for the columns Plas through age.
# The slice also covers pedi and age, but describe() shows their minimums are
# 0.078 and 21, so no values in those two columns are affected.
df.loc[ :, "Plas":"age"] = df.loc[ :, "Plas":"age"].replace(0, np.nan)
In [35]:
df.isna().sum()
Out[35]:
Preg 0 Plas 5 Pres 35 skin 227 test 374 mass 11 pedi 0 age 0 class 0 dtype: int64
In [36]:
# 1. NaN이 있는 행을 없애고 인공지능을 개발하는 방법 (가장 best이긴 하다.)
In [37]:
# 2. 다른 값으로 채우는 방법. 여기서는 각 열의 평균으로 채웠다.
In [39]:
df.mean()
Out[39]:
Preg 3.845052 Plas 121.686763 Pres 72.405184 skin 29.153420 test 155.548223 mass 32.457464 pedi 0.471876 age 33.240885 class 0.348958 dtype: float64
In [40]:
# Fill every NaN with the mean of its own column, modifying df in place.
# NOTE(review): the means come from all 768 rows, i.e. before any train/test split.
df.fillna( df.mean(), inplace=True)
In [42]:
df
Out[42]:
Preg | Plas | Pres | skin | test | mass | pedi | age | class | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148.0 | 72.0 | 35.00000 | 155.548223 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85.0 | 66.0 | 29.00000 | 155.548223 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183.0 | 64.0 | 29.15342 | 155.548223 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89.0 | 66.0 | 23.00000 | 94.000000 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137.0 | 40.0 | 35.00000 | 168.000000 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101.0 | 76.0 | 48.00000 | 180.000000 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122.0 | 70.0 | 27.00000 | 155.548223 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121.0 | 72.0 | 23.00000 | 112.000000 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126.0 | 60.0 | 29.15342 | 155.548223 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93.0 | 70.0 | 31.00000 | 155.548223 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
In [43]:
# Feature matrix: the columns Preg through age (8 columns; class is excluded).
X = df.loc[ : , "Preg":"age"]
In [44]:
# Target vector: the class column, which holds 0 or 1.
y = df["class"]
In [45]:
X
Out[45]:
Preg | Plas | Pres | skin | test | mass | pedi | age | |
---|---|---|---|---|---|---|---|---|
0 | 6 | 148.0 | 72.0 | 35.00000 | 155.548223 | 33.6 | 0.627 | 50 |
1 | 1 | 85.0 | 66.0 | 29.00000 | 155.548223 | 26.6 | 0.351 | 31 |
2 | 8 | 183.0 | 64.0 | 29.15342 | 155.548223 | 23.3 | 0.672 | 32 |
3 | 1 | 89.0 | 66.0 | 23.00000 | 94.000000 | 28.1 | 0.167 | 21 |
4 | 0 | 137.0 | 40.0 | 35.00000 | 168.000000 | 43.1 | 2.288 | 33 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101.0 | 76.0 | 48.00000 | 180.000000 | 32.9 | 0.171 | 63 |
764 | 2 | 122.0 | 70.0 | 27.00000 | 155.548223 | 36.8 | 0.340 | 27 |
765 | 5 | 121.0 | 72.0 | 23.00000 | 112.000000 | 26.2 | 0.245 | 30 |
766 | 1 | 126.0 | 60.0 | 29.15342 | 155.548223 | 30.1 | 0.349 | 47 |
767 | 1 | 93.0 | 70.0 | 31.00000 | 155.548223 | 30.4 | 0.315 | 23 |
768 rows × 8 columns
In [46]:
y
Out[46]:
0 1 1 0 2 1 3 0 4 1 .. 763 0 764 0 765 0 766 1 767 0 Name: class, Length: 768, dtype: int64
In [48]:
y_val = y.value_counts()
In [50]:
y_val
Out[50]:
0 500 1 268 Name: class, dtype: int64
In [66]:
import matplotlib.pyplot as plt
import seaborn as sb
In [67]:
# Ring-shaped pie chart of the per-class counts, labelled with the class values.
plt.pie(
    y_val,
    labels=y_val.index,
    autopct="%.0f",
    startangle=90,
    wedgeprops=dict(width=0.8),
)
plt.legend([0, 1])
plt.show()
In [71]:
# Class imbalance: bar chart of how many rows fall in each class.
sb.countplot(x="class", data=df)
plt.show()
In [72]:
# 당뇨병인 사람의 데이터가 훨씬 적으니까,
# up sampling 기법으로, 당뇨병 데이터를 늘린다.
In [ ]:
# imblearn 라이브러리를 이용하기 위해서 먼저 설치한다. (pip install imbalanced-learn)
In [74]:
from imblearn.over_sampling import SMOTE
In [78]:
# random_state=2 seeds SMOTE's random number generator so the resampling is reproducible;
# it does not set the size of the dataset.
sm = SMOTE(random_state=2)
In [79]:
sm.fit_resample(X,y)
Out[79]:
( Preg Plas Pres skin test mass pedi \ 0 6 148.000000 72.000000 35.000000 155.548223 33.600000 0.627000 1 1 85.000000 66.000000 29.000000 155.548223 26.600000 0.351000 2 8 183.000000 64.000000 29.153420 155.548223 23.300000 0.672000 3 1 89.000000 66.000000 23.000000 94.000000 28.100000 0.167000 4 0 137.000000 40.000000 35.000000 168.000000 43.100000 2.288000 .. ... ... ... ... ... ... ... 995 4 112.068692 72.931308 34.396961 104.793923 31.326706 0.583229 996 9 123.496400 57.282629 30.617023 155.548223 34.988795 0.970132 997 8 149.874944 82.500045 45.750067 238.499955 36.475033 0.972876 998 5 97.007387 84.532841 30.670194 155.548223 31.629886 0.785759 999 0 97.323977 80.545710 22.676023 38.323977 34.591861 0.330470 age 0 50 1 31 2 32 3 21 4 33 .. ... 995 31 996 34 997 46 998 58 999 23 [1000 rows x 8 columns], 0 1 1 0 2 1 3 0 4 1 .. 995 1 996 1 997 1 998 1 999 1 Name: class, Length: 1000, dtype: int64)
In [80]:
# Replace X and y with the SMOTE-resampled versions: 1000 rows, 500 per class.
X, y = sm.fit_resample(X,y)
In [81]:
X
Out[81]:
Preg | Plas | Pres | skin | test | mass | pedi | age | |
---|---|---|---|---|---|---|---|---|
0 | 6 | 148.000000 | 72.000000 | 35.000000 | 155.548223 | 33.600000 | 0.627000 | 50 |
1 | 1 | 85.000000 | 66.000000 | 29.000000 | 155.548223 | 26.600000 | 0.351000 | 31 |
2 | 8 | 183.000000 | 64.000000 | 29.153420 | 155.548223 | 23.300000 | 0.672000 | 32 |
3 | 1 | 89.000000 | 66.000000 | 23.000000 | 94.000000 | 28.100000 | 0.167000 | 21 |
4 | 0 | 137.000000 | 40.000000 | 35.000000 | 168.000000 | 43.100000 | 2.288000 | 33 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 4 | 112.068692 | 72.931308 | 34.396961 | 104.793923 | 31.326706 | 0.583229 | 31 |
996 | 9 | 123.496400 | 57.282629 | 30.617023 | 155.548223 | 34.988795 | 0.970132 | 34 |
997 | 8 | 149.874944 | 82.500045 | 45.750067 | 238.499955 | 36.475033 | 0.972876 | 46 |
998 | 5 | 97.007387 | 84.532841 | 30.670194 | 155.548223 | 31.629886 | 0.785759 | 58 |
999 | 0 | 97.323977 | 80.545710 | 22.676023 | 38.323977 | 34.591861 | 0.330470 | 23 |
1000 rows × 8 columns
In [82]:
y
Out[82]:
0 1 1 0 2 1 3 0 4 1 .. 995 1 996 1 997 1 998 1 999 1 Name: class, Length: 1000, dtype: int64
In [83]:
y.value_counts()
Out[83]:
1 500 0 500 Name: class, dtype: int64
In [ ]:
# 피처 스케일링
In [15]:
from sklearn.preprocessing import MinMaxScaler
In [16]:
scaler_X = MinMaxScaler()
In [17]:
scaler_X.fit_transform(X)
Out[17]:
array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503, 0.48333333], [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704, 0.16666667], [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938, 0.18333333], ..., [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462 , 0.07130658, 0.15 ], [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307, 0.43333333], [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556, 0.03333333]])
In [18]:
# Overwrite X with its min-max scaled values (a NumPy array).
# NOTE(review): this cell's execution count ([18]) is lower than the cells that
# build and resample X ([43], [80]), and the later train/test split prints
# unscaled values, so the scaled X does not appear to be what the model is
# trained on — confirm the run order.
X = scaler_X.fit_transform(X)
In [19]:
X
Out[19]:
array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503, 0.48333333], [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704, 0.16666667], [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938, 0.18333333], ..., [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462 , 0.07130658, 0.15 ], [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307, 0.43333333], [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556, 0.03333333]])
In [84]:
# training / test 로 분리
In [85]:
from sklearn.model_selection import train_test_split
In [86]:
train_test_split(X, y, test_size=0.2, random_state=1)
Out[86]:
[ Preg Plas Pres skin test mass pedi \ 382 1 109.000000 60.000000 8.00000 182.000000 25.400000 0.947000 994 8 139.090210 67.203012 29.15342 155.548223 34.689473 0.286294 982 1 94.873747 50.621242 29.39813 93.844689 42.340848 0.708513 47 2 71.000000 70.000000 27.00000 155.548223 28.000000 0.586000 521 3 124.000000 80.000000 33.00000 130.000000 33.200000 0.305000 .. ... ... ... ... ... ... ... 767 1 93.000000 70.000000 31.00000 155.548223 30.400000 0.315000 72 13 126.000000 90.000000 29.15342 155.548223 43.400000 0.583000 908 7 144.226665 69.066662 29.15342 155.548223 36.279998 0.168253 235 4 171.000000 72.000000 29.15342 155.548223 43.600000 0.479000 37 9 102.000000 76.000000 37.00000 155.548223 32.900000 0.665000 age 382 21 994 42 982 27 47 22 521 26 .. ... 767 23 72 42 908 41 235 26 37 46 [800 rows x 8 columns], Preg Plas Pres skin test mass pedi \ 507 1 130.000000 60.000000 23.000000 170.000000 28.600000 0.692000 818 6 99.403110 71.401791 29.153420 155.548223 30.238756 0.445441 452 0 91.000000 68.000000 32.000000 210.000000 39.900000 0.381000 368 3 81.000000 86.000000 16.000000 66.000000 27.500000 0.306000 242 3 139.000000 54.000000 29.153420 155.548223 25.600000 0.402000 .. ... ... ... ... ... ... ... 430 2 99.000000 72.405184 29.153420 155.548223 22.200000 0.108000 874 14 164.625108 74.363793 40.409052 114.886422 41.431853 0.791737 550 1 116.000000 70.000000 28.000000 155.548223 27.400000 0.204000 608 0 152.000000 82.000000 39.000000 272.000000 41.500000 0.270000 207 5 162.000000 104.000000 29.153420 155.548223 37.700000 0.151000 age 507 21 818 31 452 25 368 22 242 22 .. ... 430 23 874 43 550 21 608 27 207 52 [200 rows x 8 columns], 382 0 994 1 982 1 47 0 521 0 .. 767 0 72 1 908 1 235 1 37 1 Name: class, Length: 800, dtype: int64, 507 0 818 1 452 0 368 0 242 1 .. 430 0 874 1 550 0 608 0 207 1 Name: class, Length: 200, dtype: int64]
In [87]:
# Hold out 20% of the rows as the test set; random_state=1 makes the split repeatable.
# NOTE(review): X and y were already oversampled with SMOTE, and the split printed
# in this notebook puts rows with index >= 768 (e.g. 818, 874) in the test set, so
# synthetic samples are part of the evaluation data. Splitting first and resampling
# only the training portion would avoid that.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [89]:
# 모델링
In [88]:
from sklearn.linear_model import LogisticRegression
In [92]:
# Logistic regression classifier; only random_state is set, everything else keeps its default.
classifier = LogisticRegression(random_state=2)
In [93]:
# Fit on the training split. This run emits a ConvergenceWarning: lbfgs stopped at
# its iteration limit, and the warning suggests raising max_iter or scaling the data.
classifier.fit(X_train,y_train)
C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\linear_model\_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
Out[93]:
LogisticRegression(random_state=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=2)
In [96]:
# 시험결과
In [97]:
y_pred = classifier.predict(X_test)
In [ ]:
# 검증
In [94]:
from sklearn.metrics import confusion_matrix, accuracy_score
In [100]:
# Confusion matrix of the test labels against the predictions
# (scikit-learn puts true classes on rows and predicted classes on columns).
cm = confusion_matrix(y_test, y_pred)
In [101]:
cm
Out[101]:
array([[77, 34], [21, 68]], dtype=int64)
In [99]:
# Fraction of test rows predicted correctly; with the confusion matrix shown in
# this notebook that is (77 + 68) / 200 = 0.725.
accuracy_score(y_test, y_pred)
Out[99]:
0.725
In [103]:
# Draw the confusion matrix as a heatmap with the cell counts written on it.
sb.heatmap(cm, linewidths=0.7, cmap="RdPu", annot=True)
plt.show()
In [104]:
classifier.coef_
Out[104]:
array([[ 1.59034941e-01, 3.67687930e-02, -4.24544323e-02, -8.94296718e-03, -1.05028486e-03, 6.17648642e-02, 2.37095363e+00, 3.75308128e-03]])
In [105]:
classifier.intercept_
Out[105]:
array([-4.89066589])