import pandas as pd


# Load the Pima Indians diabetes CSV (the path is relative to the working directory).
df = pd.read_csv("../data/pima-indians-diabetes.csv")


df.describe() # Some of the values look suspicious (several columns have a minimum of 0).

df


# Number of missing (NaN) values in each column.
df.isna().sum()

Preg     0
Plas     0
Pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64


# Missing measurements are often stored as 0 in real-world data sets,
# so the zeros in the measurement columns are converted to NaN here.


import numpy as np


# Columns in which a stored 0 is treated as "value not recorded".
zero_as_missing_cols = df.loc[:, "Plas":"age"].columns

# Preview of the replacement (displayed only; nothing is assigned).
df[zero_as_missing_cols].replace(0, np.nan)


# Assign whole columns instead of writing through ``df.loc[:, ...] = ...``.
# The .loc form sets values in place and must upcast int64 columns to float64
# to hold NaN; newer pandas releases warn about that implicit upcast and are
# slated to reject it. Column assignment swaps in the new columns directly
# and yields the same frame.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)


# Missing values per column after the replacement.
df.isna().sum()

Preg       0
Plas       5
Pres      35
skin     227
test     374
mass      11
pedi       0
age        0
class      0
dtype: int64


# 1. Drop the NaN values and then train the model (generally the best option).


# 2. Fill them with another value; here each column's mean is used.


# Per-column means (NaN values are skipped by default).
df.mean()

Preg       3.845052
Plas     121.686763
Pres      72.405184
skin      29.153420
test     155.548223
mass      32.457464
pedi       0.471876
age       33.240885
class      0.348958
dtype: float64


# Fill every NaN with the mean of the column it sits in.
column_means = df.mean()
df = df.fillna(column_means)

df


# Feature matrix: all columns from "Preg" through "age" (label slices are inclusive).
X = df.loc[:, "Preg":"age"]


# Target vector: the "class" column.
y = df.loc[:, "class"]

X

y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64


# Number of rows per class label, most frequent class first.
y_val = y.value_counts()


y_val

0    500
1    268
Name: class, dtype: int64


import matplotlib.pyplot as plt
import seaborn as sb


# Share of each class, drawn as a donut chart.
plt.pie(
    y_val,
    labels=y_val.index,
    autopct="%.0f%%",  # "%%" adds a literal percent sign after the rounded share
    startangle=90,
    wedgeprops={"width": 0.8},
)

# Build the legend from the wedge labels instead of a hard-coded [0, 1], so it
# cannot get out of step with the order of the entries in y_val.
plt.legend()
plt.show()


# Class imbalance, shown as the raw row count per class.
sb.countplot(data=df, x="class")
plt.show()


# There are far fewer diabetic samples than non-diabetic ones,
# so the diabetic class is oversampled to balance the data.


# The imblearn (imbalanced-learn) package must be installed for this import.


from imblearn.over_sampling import SMOTE


sm = SMOTE(random_state=2)  # random_state is the random seed for repeatable resampling; it does not set a data set size


# Preview only: the resampled (X, y) pair is displayed, not assigned.
sm.fit_resample(X,y)

(     Preg        Plas       Pres       skin        test       mass      pedi  \
 0       6  148.000000  72.000000  35.000000  155.548223  33.600000  0.627000   
 1       1   85.000000  66.000000  29.000000  155.548223  26.600000  0.351000   
 2       8  183.000000  64.000000  29.153420  155.548223  23.300000  0.672000   
 3       1   89.000000  66.000000  23.000000   94.000000  28.100000  0.167000   
 4       0  137.000000  40.000000  35.000000  168.000000  43.100000  2.288000   
 ..    ...         ...        ...        ...         ...        ...       ...   
 995     4  112.068692  72.931308  34.396961  104.793923  31.326706  0.583229   
 996     9  123.496400  57.282629  30.617023  155.548223  34.988795  0.970132   
 997     8  149.874944  82.500045  45.750067  238.499955  36.475033  0.972876   
 998     5   97.007387  84.532841  30.670194  155.548223  31.629886  0.785759   
 999     0   97.323977  80.545710  22.676023   38.323977  34.591861  0.330470   
 
      age  
 0     50  
 1     31  
 2     32  
 3     21  
 4     33  
 ..   ...  
 995   31  
 996   34  
 997   46  
 998   58  
 999   23  
 
 [1000 rows x 8 columns],
 0      1
 1      0
 2      1
 3      0
 4      1
       ..
 995    1
 996    1
 997    1
 998    1
 999    1
 Name: class, Length: 1000, dtype: int64)


# Overwrite X and y with the oversampled data.
# NOTE(review): the whole data set is resampled here and train_test_split is
# only called further down, so synthetic rows can end up in the test split.
# Consider splitting first and resampling only the training portion.
X, y = sm.fit_resample(X,y)

X

y

0      1
1      0
2      1
3      0
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: class, Length: 1000, dtype: int64


# Rows per class after oversampling (both classes should now be the same size).
y.value_counts()

1    500
0    500
Name: class, dtype: int64


# Feature scaling


from sklearn.preprocessing import MinMaxScaler


# Scaler for the feature matrix.
scaler_X = MinMaxScaler()


# Preview only: the scaled array is displayed, X itself is not changed yet.
scaler_X.fit_transform(X)

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])


# Learn the per-column scaling from X, then replace X with its scaled
# version (a NumPy array rather than a DataFrame).
# NOTE(review): the scaler is fitted on all rows before train_test_split is
# called, so the test rows influence the scaling parameters. Consider fitting
# on the training split only.
scaler_X.fit(X)
X = scaler_X.transform(X)

X

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])


# Split into training / test sets


from sklearn.model_selection import train_test_split


# Preview only: the four resulting pieces are displayed, not assigned.
train_test_split(X, y, test_size=0.2, random_state=1)

[     Preg        Plas       Pres      skin        test       mass      pedi  \
 382     1  109.000000  60.000000   8.00000  182.000000  25.400000  0.947000   
 994     8  139.090210  67.203012  29.15342  155.548223  34.689473  0.286294   
 982     1   94.873747  50.621242  29.39813   93.844689  42.340848  0.708513   
 47      2   71.000000  70.000000  27.00000  155.548223  28.000000  0.586000   
 521     3  124.000000  80.000000  33.00000  130.000000  33.200000  0.305000   
 ..    ...         ...        ...       ...         ...        ...       ...   
 767     1   93.000000  70.000000  31.00000  155.548223  30.400000  0.315000   
 72     13  126.000000  90.000000  29.15342  155.548223  43.400000  0.583000   
 908     7  144.226665  69.066662  29.15342  155.548223  36.279998  0.168253   
 235     4  171.000000  72.000000  29.15342  155.548223  43.600000  0.479000   
 37      9  102.000000  76.000000  37.00000  155.548223  32.900000  0.665000   
 
      age  
 382   21  
 994   42  
 982   27  
 47    22  
 521   26  
 ..   ...  
 767   23  
 72    42  
 908   41  
 235   26  
 37    46  
 
 [800 rows x 8 columns],
      Preg        Plas        Pres       skin        test       mass      pedi  \
 507     1  130.000000   60.000000  23.000000  170.000000  28.600000  0.692000   
 818     6   99.403110   71.401791  29.153420  155.548223  30.238756  0.445441   
 452     0   91.000000   68.000000  32.000000  210.000000  39.900000  0.381000   
 368     3   81.000000   86.000000  16.000000   66.000000  27.500000  0.306000   
 242     3  139.000000   54.000000  29.153420  155.548223  25.600000  0.402000   
 ..    ...         ...         ...        ...         ...        ...       ...   
 430     2   99.000000   72.405184  29.153420  155.548223  22.200000  0.108000   
 874    14  164.625108   74.363793  40.409052  114.886422  41.431853  0.791737   
 550     1  116.000000   70.000000  28.000000  155.548223  27.400000  0.204000   
 608     0  152.000000   82.000000  39.000000  272.000000  41.500000  0.270000   
 207     5  162.000000  104.000000  29.153420  155.548223  37.700000  0.151000   
 
      age  
 507   21  
 818   31  
 452   25  
 368   22  
 242   22  
 ..   ...  
 430   23  
 874   43  
 550   21  
 608   27  
 207   52  
 
 [200 rows x 8 columns],
 382    0
 994    1
 982    1
 47     0
 521    0
       ..
 767    0
 72     1
 908    1
 235    1
 37     1
 Name: class, Length: 800, dtype: int64,
 507    0
 818    1
 452    0
 368    0
 242    1
       ..
 430    0
 874    1
 550    0
 608    0
 207    1
 Name: class, Length: 200, dtype: int64]


# Hold out 20% of the rows for testing; random_state=1 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


# Modelling


from sklearn.linear_model import LogisticRegression


# max_iter is raised above the library default because the lbfgs solver was
# observed to stop at its iteration limit without converging on this data
# (ConvergenceWarning: "TOTAL NO. of ITERATIONS REACHED LIMIT").
classifier = LogisticRegression(random_state=2, max_iter=1000)


# Fit the model on the training split.
classifier.fit(X_train,y_train)

C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\linear_model\_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

LogisticRegression(random_state=2)

LogisticRegression(random_state=2)


# Predictions on the test set


y_pred = classifier.predict(X_test)


# Evaluation


from sklearn.metrics import confusion_matrix, accuracy_score


# Confusion matrix built from the true test labels and the predictions.
cm = confusion_matrix(y_test, y_pred)

cm

array([[77, 34],
       [21, 68]], dtype=int64)


# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.725


# Confusion matrix as an annotated heat map.
# fmt="d" prints the integer counts as they are; the default annotation format
# switches to scientific notation once a cell reaches three digits.
sb.heatmap(data=cm, annot=True, fmt="d", cmap="RdPu", linewidths=0.7)
# confusion_matrix puts the true labels on the rows and the predictions on the columns.
plt.xlabel("Predicted class")
plt.ylabel("Actual class")
plt.show()


# Fitted coefficients of the logistic regression, one per feature column.
classifier.coef_

array([[ 1.59034941e-01,  3.67687930e-02, -4.24544323e-02,
        -8.94296718e-03, -1.05028486e-03,  6.17648642e-02,
         2.37095363e+00,  3.75308128e-03]])


# Fitted intercept (bias) term of the logistic regression.
classifier.intercept_

array([-4.89066589])

	Preg	Plas	Pres	skin	test	mass	pedi	age	class
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	120.894531	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885	0.348958
std	3.369578	31.972618	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232	0.476951
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000	0.000000
75%	6.000000	140.250000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000

	Preg	Plas	Pres	skin	test	mass	pedi	age	class
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1
...	...	...	...	...	...	...	...	...	...
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

	Plas	Pres	skin	test	mass	pedi	age
0	148.0	72.0	35.0	NaN	33.6	0.627	50
1	85.0	66.0	29.0	NaN	26.6	0.351	31
2	183.0	64.0	NaN	NaN	23.3	0.672	32
3	89.0	66.0	23.0	94.0	28.1	0.167	21
4	137.0	40.0	35.0	168.0	43.1	2.288	33
...	...	...	...	...	...	...	...
763	101.0	76.0	48.0	180.0	32.9	0.171	63
764	122.0	70.0	27.0	NaN	36.8	0.340	27
765	121.0	72.0	23.0	112.0	26.2	0.245	30
766	126.0	60.0	NaN	NaN	30.1	0.349	47
767	93.0	70.0	31.0	NaN	30.4	0.315	23

	Preg	Plas	Pres	skin	test	mass	pedi	age	class
0	6	148.0	72.0	35.00000	155.548223	33.6	0.627	50	1
1	1	85.0	66.0	29.00000	155.548223	26.6	0.351	31	0
2	8	183.0	64.0	29.15342	155.548223	23.3	0.672	32	1
3	1	89.0	66.0	23.00000	94.000000	28.1	0.167	21	0
4	0	137.0	40.0	35.00000	168.000000	43.1	2.288	33	1
...	...	...	...	...	...	...	...	...	...
763	10	101.0	76.0	48.00000	180.000000	32.9	0.171	63	0
764	2	122.0	70.0	27.00000	155.548223	36.8	0.340	27	0
765	5	121.0	72.0	23.00000	112.000000	26.2	0.245	30	0
766	1	126.0	60.0	29.15342	155.548223	30.1	0.349	47	1
767	1	93.0	70.0	31.00000	155.548223	30.4	0.315	23	0

	Preg	Plas	Pres	skin	test	mass	pedi	age
0	6	148.000000	72.000000	35.000000	155.548223	33.600000	0.627000	50
1	1	85.000000	66.000000	29.000000	155.548223	26.600000	0.351000	31
2	8	183.000000	64.000000	29.153420	155.548223	23.300000	0.672000	32
3	1	89.000000	66.000000	23.000000	94.000000	28.100000	0.167000	21
4	0	137.000000	40.000000	35.000000	168.000000	43.100000	2.288000	33
...	...	...	...	...	...	...	...	...
995	4	112.068692	72.931308	34.396961	104.793923	31.326706	0.583229	31
996	9	123.496400	57.282629	30.617023	155.548223	34.988795	0.970132	34
997	8	149.874944	82.500045	45.750067	238.499955	36.475033	0.972876	46
998	5	97.007387	84.532841	30.670194	155.548223	31.629886	0.785759	58
999	0	97.323977	80.545710	22.676023	38.323977	34.591861	0.330470	23

Machine Logistic Regression 데이터의 결점보완(0,nan), 데이터의 불균형 up sampling 기법, 결과를 히트맵으로 표현