
In [1]:
import pandas as pd
In [3]:
df = pd.read_csv("../data/Social_Network_Ads.csv")
In [4]:
df
Out[4]:
User ID | Gender | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|---|
0 | 15624510 | Male | 19 | 19000 | 0 |
1 | 15810944 | Male | 35 | 20000 | 0 |
2 | 15668575 | Female | 26 | 43000 | 0 |
3 | 15603246 | Female | 27 | 57000 | 0 |
4 | 15804002 | Male | 19 | 76000 | 0 |
... | ... | ... | ... | ... | ... |
395 | 15691863 | Female | 46 | 41000 | 1 |
396 | 15706071 | Male | 51 | 23000 | 1 |
397 | 15654296 | Female | 50 | 20000 | 1 |
398 | 15755018 | Male | 36 | 33000 | 0 |
399 | 15594041 | Female | 49 | 36000 | 1 |
400 rows × 5 columns
In [5]:
df.isna().sum()
Out[5]:
User ID 0 Gender 0 Age 0 EstimatedSalary 0 Purchased 0 dtype: int64
In [6]:
df.describe()
Out[6]:
User ID | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|
count | 4.000000e+02 | 400.000000 | 400.000000 | 400.000000 |
mean | 1.569154e+07 | 37.655000 | 69742.500000 | 0.357500 |
std | 7.165832e+04 | 10.482877 | 34096.960282 | 0.479864 |
min | 1.556669e+07 | 18.000000 | 15000.000000 | 0.000000 |
25% | 1.562676e+07 | 29.750000 | 43000.000000 | 0.000000 |
50% | 1.569434e+07 | 37.000000 | 70000.000000 | 0.000000 |
75% | 1.575036e+07 | 46.000000 | 88000.000000 | 1.000000 |
max | 1.581524e+07 | 60.000000 | 150000.000000 | 1.000000 |
In [7]:
y = df["Purchased"]
In [12]:
X = df.loc[ : ,["Age", "EstimatedSalary"] ]
In [13]:
from sklearn.preprocessing import MinMaxScaler
In [14]:
scaler_X = MinMaxScaler()
In [15]:
X = scaler_X.fit_transform(X)
In [18]:
y.value_counts()
Out[18]:
0 257 1 143 Name: Purchased, dtype: int64
In [19]:
from sklearn.model_selection import train_test_split
In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
In [ ]:
In [21]:
from sklearn.neighbors import KNeighborsClassifier
In [22]:
# 가장 가까운 5개 데이터로 분류
classifier = KNeighborsClassifier()
In [23]:
classifier.fit(X_train,y_train)
Out[23]:
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [24]:
y_pred = classifier.predict(X_test)
In [25]:
y_pred
Out[25]:
array([0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0], dtype=int64)
In [27]:
y_test.values
Out[27]:
array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0], dtype=int64)
In [ ]:
In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
In [30]:
cm = confusion_matrix(y_test, y_pred)
In [31]:
cm
Out[31]:
array([[50, 8], [ 4, 38]], dtype=int64)
In [32]:
accuracy_score(y_test,y_pred)
Out[32]:
0.88
In [ ]:
In [36]:
import numpy as np
import matplotlib.pyplot as plt
In [37]:
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1,
stop = X_set[:, 1].max() + 1, step = 0.01))
plt.figure(figsize=[10,7])
plt.contourf(X1, X2, classifier.predict(
np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Classifier (Test set)')
plt.legend()
plt.show()
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.