DataScience/MachineLearning
Machine [unsupervised{Clustering(K-means)}] (평할/분할 기반의 군집)
leopard4
2022. 12. 5. 12:13

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Clustering: group customers with similar characteristics together.
In [4]:
# Load the mall-customer dataset from the data directory next to the notebook folder.
csv_path = "../data/Mall_Customers.csv"
df = pd.read_csv(csv_path)
In [5]:
df.head()
Out[5]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 |
1 | 2 | Male | 21 | 15 | 81 |
2 | 3 | Female | 20 | 16 | 6 |
3 | 4 | Female | 23 | 16 | 77 |
4 | 5 | Female | 31 | 17 | 40 |
In [6]:
# Count the missing values in every column.
df.isnull().sum()
Out[6]:
CustomerID 0 Genre 0 Age 0 Annual Income (k$) 0 Spending Score (1-100) 0 dtype: int64
In [7]:
df.describe()
Out[7]:
CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|
count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
In [8]:
# Set up X only: this is unsupervised learning, so there is no target y.
In [9]:
df.head(3)
Out[9]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 |
1 | 2 | Male | 21 | 15 | 81 |
2 | 3 | Female | 20 | 16 | 6 |
In [10]:
# Feature matrix for clustering: annual income and spending score only.
# The columns are named explicitly instead of slicing "from Annual Income to
# the last column", because later cells append a "Group" column to df and an
# open-ended slice would then leak the cluster labels into X on a re-run.
X = df[["Annual Income (k$)", "Spending Score (1-100)"]]
In [11]:
X
Out[11]:
Annual Income (k$) | Spending Score (1-100) | |
---|---|---|
0 | 15 | 39 |
1 | 15 | 81 |
2 | 16 | 6 |
3 | 16 | 77 |
4 | 17 | 40 |
... | ... | ... |
195 | 120 | 79 |
196 | 126 | 28 |
197 | 126 | 74 |
198 | 137 | 18 |
199 | 137 | 83 |
200 rows × 2 columns
In [12]:
# Modeling
In [13]:
from sklearn.cluster import KMeans
In [14]:
# n_clusters is the number of groups to form.
# n_init is pinned to 10 (the historical default) so the clustering stays the
# same on scikit-learn releases whose default is 'auto'.
kmeans = KMeans(n_clusters=3, random_state=2, n_init=10)
In [15]:
# Fit the model on X and keep the cluster label assigned to each row.
y_pred = kmeans.fit(X).labels_
C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
In [16]:
y_pred
Out[16]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1])
In [17]:
# Store the cluster labels held in y_pred as a "Group" column on df.
df["Group"] = y_pred
In [18]:
df
Out[18]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 0 |
1 | 2 | Male | 21 | 15 | 81 | 0 |
2 | 3 | Female | 20 | 16 | 6 | 0 |
3 | 4 | Female | 23 | 16 | 77 | 0 |
4 | 5 | Female | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... | ... | ... |
195 | 196 | Female | 35 | 120 | 79 | 1 |
196 | 197 | Female | 45 | 126 | 28 | 2 |
197 | 198 | Male | 32 | 126 | 74 | 1 |
198 | 199 | Male | 32 | 137 | 18 | 2 |
199 | 200 | Male | 30 | 137 | 83 | 1 |
200 rows × 6 columns
In [19]:
# Select only the customers whose group number is 1.
In [20]:
# Boolean mask of the rows labelled as group 1, then show just those rows.
in_group_1 = df["Group"] == 1
df[in_group_1]
Out[20]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
123 | 124 | Male | 39 | 69 | 91 | 1 |
125 | 126 | Female | 31 | 70 | 77 | 1 |
127 | 128 | Male | 40 | 71 | 95 | 1 |
129 | 130 | Male | 38 | 71 | 75 | 1 |
131 | 132 | Male | 39 | 71 | 75 | 1 |
133 | 134 | Female | 31 | 72 | 71 | 1 |
135 | 136 | Female | 29 | 73 | 88 | 1 |
137 | 138 | Male | 32 | 73 | 73 | 1 |
139 | 140 | Female | 35 | 74 | 72 | 1 |
141 | 142 | Male | 32 | 75 | 93 | 1 |
143 | 144 | Female | 32 | 76 | 87 | 1 |
145 | 146 | Male | 28 | 77 | 97 | 1 |
147 | 148 | Female | 32 | 77 | 74 | 1 |
149 | 150 | Male | 34 | 78 | 90 | 1 |
151 | 152 | Male | 39 | 78 | 88 | 1 |
153 | 154 | Female | 38 | 78 | 76 | 1 |
155 | 156 | Female | 27 | 78 | 89 | 1 |
157 | 158 | Female | 30 | 78 | 78 | 1 |
159 | 160 | Female | 30 | 78 | 73 | 1 |
161 | 162 | Female | 29 | 79 | 83 | 1 |
163 | 164 | Female | 31 | 81 | 93 | 1 |
165 | 166 | Female | 36 | 85 | 75 | 1 |
167 | 168 | Female | 33 | 86 | 95 | 1 |
169 | 170 | Male | 32 | 87 | 63 | 1 |
171 | 172 | Male | 28 | 87 | 75 | 1 |
173 | 174 | Male | 36 | 87 | 92 | 1 |
175 | 176 | Female | 30 | 88 | 86 | 1 |
177 | 178 | Male | 27 | 88 | 69 | 1 |
179 | 180 | Male | 35 | 93 | 90 | 1 |
181 | 182 | Female | 32 | 97 | 86 | 1 |
183 | 184 | Female | 29 | 98 | 88 | 1 |
185 | 186 | Male | 30 | 99 | 97 | 1 |
187 | 188 | Male | 28 | 101 | 68 | 1 |
189 | 190 | Female | 36 | 103 | 85 | 1 |
191 | 192 | Female | 32 | 103 | 69 | 1 |
193 | 194 | Female | 38 | 113 | 91 | 1 |
195 | 196 | Female | 35 | 120 | 79 | 1 |
197 | 198 | Male | 32 | 126 | 74 | 1 |
199 | 200 | Male | 30 | 137 | 83 | 1 |
In [ ]:
In [21]:
# WCSS (within-cluster sum of squares) is widely used in marketing practice
# as the measure for choosing the optimal number of groups.
In [24]:
wcss = []
# Fit K-means for k = 1..10 and record each fitted model's inertia_ (its WCSS)
# in the wcss list initialised just above.
for k in range(1, 10 + 1):
    # n_init is pinned to 10 (the historical default) so the recorded values
    # do not change on scikit-learn releases whose default is 'auto'.
    kmeans = KMeans(n_clusters=k, random_state=5, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
In [25]:
wcss
Out[25]:
[269981.28, 181363.59595959596, 106348.37306211119, 73679.78903948834, 44448.45544793371, 37233.81451071001, 30259.65720728547, 25079.7669621159, 21850.165282585636, 19712.851860217077]
In [ ]:
# Plot the WCSS value computed above for each number of clusters.
# Choosing the number of clusters from this chart is called the elbow method.
In [27]:
In [29]:
# Elbow chart: WCSS on the y-axis against the number of clusters tried (1..10).
x = np.arange(1, 10+1)
fig, ax = plt.subplots()
ax.plot(x, wcss)
ax.set_title("The Elbow Method")
ax.set_xlabel('Number of Clusters')
ax.set_ylabel("WCSS")
plt.show()
In [30]:
# Final model with five clusters.
# n_init is pinned to 10 (the historical default) so the clustering stays the
# same on scikit-learn releases whose default is 'auto'.
kmeans = KMeans(n_clusters=5, random_state=5, n_init=10)
In [32]:
# Fit the five-cluster model on X and keep the label assigned to each row.
y_pred = kmeans.fit(X).labels_
C:\Users\5-10\Anaconda3\envs\YH\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
In [33]:
y_pred
Out[33]:
array([3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 1, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 2, 1, 2, 4, 2, 4, 2, 1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2])
In [34]:
# Set the "Group" column to the cluster labels currently held in y_pred.
df["Group"] = y_pred
In [35]:
df
Out[35]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 3 |
1 | 2 | Male | 21 | 15 | 81 | 0 |
2 | 3 | Female | 20 | 16 | 6 | 3 |
3 | 4 | Female | 23 | 16 | 77 | 0 |
4 | 5 | Female | 31 | 17 | 40 | 3 |
... | ... | ... | ... | ... | ... | ... |
195 | 196 | Female | 35 | 120 | 79 | 2 |
196 | 197 | Female | 45 | 126 | 28 | 4 |
197 | 198 | Male | 32 | 126 | 74 | 2 |
198 | 199 | Male | 32 | 137 | 18 | 4 |
199 | 200 | Male | 30 | 137 | 83 | 2 |
200 rows × 6 columns
In [ ]:
In [37]:
import seaborn as sb
In [39]:
# Plain scatter of annual income against spending score, without cluster colours.
sb.scatterplot(x = 'Annual Income (k$)', y = 'Spending Score (1-100)', data = df)
plt.show()
In [ ]:
In [40]:
# Scatter the customers coloured by cluster label, then overlay the centroids.
# One loop replaces five copy-pasted scatter calls; label i is drawn in
# cluster_colors[i] and captioned "Cluster i+1", exactly as before.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
points = X.values  # column 0 goes on the x-axis, column 1 on the y-axis

plt.figure(figsize=[12,8])
for cluster_id, color in enumerate(cluster_colors):
    members = points[y_pred == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = color, label = f'Cluster {cluster_id + 1}')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
In [ ]:
In [ ]:
# 1. Fetch the data for the customers whose group is 3.
In [42]:
df.head(1)
Out[42]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 3 |
In [44]:
# Show only the rows whose cluster label is 3.
in_group_3 = df["Group"] == 3
df[in_group_3]
Out[44]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 3 |
2 | 3 | Female | 20 | 16 | 6 | 3 |
4 | 5 | Female | 31 | 17 | 40 | 3 |
6 | 7 | Female | 35 | 18 | 6 | 3 |
8 | 9 | Male | 64 | 19 | 3 | 3 |
10 | 11 | Male | 67 | 19 | 14 | 3 |
12 | 13 | Female | 58 | 20 | 15 | 3 |
14 | 15 | Male | 37 | 20 | 13 | 3 |
16 | 17 | Female | 35 | 21 | 35 | 3 |
18 | 19 | Male | 52 | 23 | 29 | 3 |
20 | 21 | Male | 35 | 24 | 35 | 3 |
22 | 23 | Female | 46 | 25 | 5 | 3 |
24 | 25 | Female | 54 | 28 | 14 | 3 |
26 | 27 | Female | 45 | 28 | 32 | 3 |
28 | 29 | Female | 40 | 29 | 31 | 3 |
30 | 31 | Male | 60 | 30 | 4 | 3 |
32 | 33 | Male | 53 | 33 | 4 | 3 |
34 | 35 | Female | 49 | 33 | 14 | 3 |
36 | 37 | Female | 42 | 34 | 17 | 3 |
38 | 39 | Female | 36 | 37 | 26 | 3 |
40 | 41 | Female | 65 | 38 | 35 | 3 |
42 | 43 | Male | 48 | 39 | 36 | 3 |
44 | 45 | Female | 49 | 39 | 28 | 3 |
In [49]:
# Keep the group-3 customers in their own frame for the follow-up questions.
df_group_3 = df[df["Group"].eq(3)]
In [41]:
# 2. What is the average income of the customers in group 3?
In [50]:
# Average annual income (k$) of the group-3 customers.
df_group_3.loc[:, "Annual Income (k$)"].mean()
Out[50]:
26.304347826086957
In [46]:
# Same question answered for every group at once: mean income per cluster.
income_by_group = df.groupby("Group")["Annual Income (k$)"]
income_by_group.mean()
Out[46]:
Group 0 25.727273 1 55.296296 2 86.538462 3 26.304348 4 88.200000 Name: Annual Income (k$), dtype: float64
In [ ]:
# 3. Show the Spending Score for each group,
# displaying the mean and the maximum together.
In [47]:
# Full summary statistics of the spending score within each cluster.
spending_by_group = df.groupby("Group")["Spending Score (1-100)"]
spending_by_group.describe()
Out[47]:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Group | ||||||||
0 | 22.0 | 79.363636 | 10.504174 | 61.0 | 73.0 | 77.0 | 85.75 | 99.0 |
1 | 81.0 | 49.518519 | 6.530909 | 34.0 | 44.0 | 50.0 | 55.00 | 61.0 |
2 | 39.0 | 82.128205 | 9.364489 | 63.0 | 74.5 | 83.0 | 90.00 | 97.0 |
3 | 23.0 | 20.913043 | 13.017167 | 3.0 | 9.5 | 17.0 | 33.50 | 40.0 |
4 | 35.0 | 17.114286 | 9.952154 | 1.0 | 10.0 | 16.0 | 23.50 | 39.0 |
In [51]:
# Mean and maximum spending score per cluster.
# String aliases are used instead of the NumPy callables: newer pandas warns
# when np.mean / np.max are passed to agg, and the label of the max column
# ("amax" vs "max") would otherwise depend on the installed NumPy version.
df.groupby("Group")["Spending Score (1-100)"].agg(["mean", "max"])
Out[51]:
mean | amax | |
---|---|---|
Group | ||
0 | 79.363636 | 99 |
1 | 49.518519 | 61 |
2 | 82.128205 | 97 |
3 | 20.913043 | 40 |
4 | 17.114286 | 39 |
In [52]:
# For each group, compute the mean of income and of spending score.
In [53]:
df.head(1)
Out[53]:
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | Group | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 3 |
In [55]:
# Summary statistics of income and spending score within each cluster.
metric_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
df.groupby("Group")[metric_cols].describe()
Out[55]:
Annual Income (k$) | Spending Score (1-100) | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
Group | ||||||||||||||||
0 | 22.0 | 25.727273 | 7.566731 | 15.0 | 19.25 | 24.5 | 32.25 | 39.0 | 22.0 | 79.363636 | 10.504174 | 61.0 | 73.0 | 77.0 | 85.75 | 99.0 |
1 | 81.0 | 55.296296 | 8.988109 | 39.0 | 48.00 | 54.0 | 62.00 | 76.0 | 81.0 | 49.518519 | 6.530909 | 34.0 | 44.0 | 50.0 | 55.00 | 61.0 |
2 | 39.0 | 86.538462 | 16.312485 | 69.0 | 75.50 | 79.0 | 95.00 | 137.0 | 39.0 | 82.128205 | 9.364489 | 63.0 | 74.5 | 83.0 | 90.00 | 97.0 |
3 | 23.0 | 26.304348 | 7.893811 | 15.0 | 19.50 | 25.0 | 33.00 | 39.0 | 23.0 | 20.913043 | 13.017167 | 3.0 | 9.5 | 17.0 | 33.50 | 40.0 |
4 | 35.0 | 88.200000 | 16.399067 | 70.0 | 77.50 | 85.0 | 97.50 | 137.0 | 35.0 | 17.114286 | 9.952154 | 1.0 | 10.0 | 16.0 | 23.50 | 39.0 |
In [59]:
# Per-cluster mean of every numeric column.
# Only numeric columns are passed in: recent pandas no longer drops a text
# column such as Genre silently and raises when asked to average it.
# "mean" replaces the np.mean callable, which newer pandas warns about.
pd.pivot_table(df.select_dtypes(include="number"), index="Group", aggfunc="mean")
Out[59]:
Age | Annual Income (k$) | CustomerID | Spending Score (1-100) | |
---|---|---|---|---|
Group | ||||
0 | 25.272727 | 25.727273 | 23.090909 | 79.363636 |
1 | 42.716049 | 55.296296 | 86.320988 | 49.518519 |
2 | 32.692308 | 86.538462 | 162.000000 | 82.128205 |
3 | 45.217391 | 26.304348 | 23.000000 | 20.913043 |
4 | 41.114286 | 88.200000 | 164.371429 | 17.114286 |
In [60]:
# Per-cluster mean of income and spending score only.
# "mean" replaces the np.mean callable, which newer pandas warns about.
pd.pivot_table(
    df,
    index="Group",
    values=['Annual Income (k$)', 'Spending Score (1-100)'],
    aggfunc="mean",
)
Out[60]:
Annual Income (k$) | Spending Score (1-100) | |
---|---|---|
Group | ||
0 | 25.727273 | 79.363636 |
1 | 55.296296 | 49.518519 |
2 | 86.538462 | 82.128205 |
3 | 26.304348 | 20.913043 |
4 | 88.200000 | 17.114286 |