
In [1]:
In [ ]:
Import libraries¶
In [2]:
# Data Preprocessing Template
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import the dataset¶
In [4]:
# Load the raw dataset from a CSV file; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/Data.csv")
In [5]:
# Display the whole frame; it is small enough that the missing Age / Salary entries are visible at a glance.
df
Out[5]:
Country | Age | Salary | Purchased | |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | No |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | Germany | 30.0 | 54000.0 | No |
3 | Spain | 38.0 | 61000.0 | No |
4 | Germany | 40.0 | NaN | Yes |
5 | France | 35.0 | 58000.0 | Yes |
6 | Spain | NaN | 52000.0 | No |
7 | France | 48.0 | 79000.0 | Yes |
8 | Germany | 50.0 | 83000.0 | No |
9 | France | 37.0 | 67000.0 | Yes |
In [6]:
# Summary statistics; by default only the numeric columns (Age, Salary) are summarized.
df.describe()
Out[6]:
Age | Salary | |
---|---|---|
count | 9.000000 | 9.000000 |
mean | 38.777778 | 63777.777778 |
std | 7.693793 | 12265.579662 |
min | 27.000000 | 48000.000000 |
25% | 35.000000 | 54000.000000 |
50% | 38.000000 | 61000.000000 |
75% | 44.000000 | 72000.000000 |
max | 50.000000 | 83000.000000 |
In [10]:
# Categorical summary of Country: count, number of distinct values, most frequent value and its frequency.
df["Country"].describe()
Out[10]:
count 10 unique 3 top France freq 4 Name: Country, dtype: object
In [11]:
# Distinct country labels, in order of first appearance.
df["Country"].unique()
Out[11]:
array(['France', 'Spain', 'Germany'], dtype=object)
In [12]:
# Categorical summary of the Purchased column (the label to be predicted later).
df["Purchased"].describe()
Out[12]:
count 10 unique 2 top No freq 5 Name: Purchased, dtype: object
In [13]:
# Distinct label values, in order of first appearance.
df["Purchased"].unique()
Out[13]:
array(['No', 'Yes'], dtype=object)
In [ ]:
NaN 처리¶
In [14]:
# Count the missing values in each column.
df.isna().sum()
Out[14]:
Country 0 Age 1 Salary 1 Purchased 0 dtype: int64
In [15]:
# 1. Deletion strategy: drop every row that contains a missing value.
In [25]:
# Drop every row that has at least one NaN and rebind `df` to the result.
# Rebinding (rather than `inplace=True`) keeps the step explicit and chainable.
# NOTE(review): the execution counts show this cell was last run *after* the
# fill-strategy demo that follows it; on a top-to-bottom run the NaN rows are
# already gone by the time that demo executes, so it would have nothing to fill.
df = df.dropna()
In [16]:
# 2. Fill strategy: replace missing values instead of dropping the rows.
In [23]:
# Fill the missing entries of each numeric column with that column's mean.
# numeric_only=True restricts the mean to numeric columns (the original
# author passed it to silence a pandas message). The filled frame is only
# displayed, not assigned, so `df` itself is left unchanged.
df.fillna(value=df.mean(numeric_only=True))
Out[23]:
Country | Age | Salary | Purchased | |
---|---|---|---|---|
0 | France | 44.000000 | 72000.000000 | No |
1 | Spain | 27.000000 | 48000.000000 | Yes |
2 | Germany | 30.000000 | 54000.000000 | No |
3 | Spain | 38.000000 | 61000.000000 | No |
4 | Germany | 40.000000 | 63777.777778 | Yes |
5 | France | 35.000000 | 58000.000000 | Yes |
6 | Spain | 38.777778 | 52000.000000 | No |
7 | France | 48.000000 | 79000.000000 | Yes |
8 | Germany | 50.000000 | 83000.000000 | No |
9 | France | 37.000000 | 67000.000000 | Yes |
In [26]:
# Inspect the frame after NaN handling: rows 4 and 6 are gone and the index has not been renumbered.
df
Out[26]:
Country | Age | Salary | Purchased | |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | No |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | Germany | 30.0 | 54000.0 | No |
3 | Spain | 38.0 | 61000.0 | No |
5 | France | 35.0 | 58000.0 | Yes |
7 | France | 48.0 | 79000.0 | Yes |
8 | Germany | 50.0 | 83000.0 | No |
9 | France | 37.0 | 67000.0 | Yes |
In [ ]:
In [ ]:
In [ ]:
X, y 데이터 분리 : 즉 학습할 변수(X)와 레이블링 변수(y)로 분리¶
In [27]:
# The column to be predicted (the label) is named with a lowercase y [universal convention].
# The data supplied to make the prediction (the features) is named with an uppercase X [universal convention].
df
Out[27]:
Country | Age | Salary | Purchased | |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | No |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | Germany | 30.0 | 54000.0 | No |
3 | Spain | 38.0 | 61000.0 | No |
5 | France | 35.0 | 58000.0 | Yes |
7 | France | 48.0 | 79000.0 | Yes |
8 | Germany | 50.0 | 83000.0 | No |
9 | France | 37.0 | 67000.0 | Yes |
In [29]:
# Feature matrix: every column from "Country" through "Salary"
# (label-based slices include both endpoints).
# Take an explicit copy so that later column assignments on X neither
# touch df nor raise SettingWithCopyWarning.
X = df.loc[:, "Country":"Salary"].copy()
In [31]:
# Label vector: the "Purchased" column, i.e. the value to be predicted.
y = df.loc[:, "Purchased"]
In [32]:
# Check the feature columns (Country, Age, Salary).
X
Out[32]:
Country | Age | Salary | |
---|---|---|---|
0 | France | 44.0 | 72000.0 |
1 | Spain | 27.0 | 48000.0 |
2 | Germany | 30.0 | 54000.0 |
3 | Spain | 38.0 | 61000.0 |
5 | France | 35.0 | 58000.0 |
7 | France | 48.0 | 79000.0 |
8 | Germany | 50.0 | 83000.0 |
9 | France | 37.0 | 67000.0 |
In [33]:
# Check the label column; it keeps the same row index as X.
y
Out[33]:
0 No 1 Yes 2 No 3 No 5 Yes 7 Yes 8 No 9 Yes Name: Purchased, dtype: object
In [ ]:
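# To train on the data,
# it has to be plugged into an equation,
# and because an equation is a mathematical expression, every value must be numeric.
# String data therefore has to be converted to numbers first.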
In [34]:
# Look at the frame again: Country and Purchased still hold strings.
df
Out[34]:
Country | Age | Salary | Purchased | |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | No |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | Germany | 30.0 | 54000.0 | No |
3 | Spain | 38.0 | 61000.0 | No |
5 | France | 35.0 | 58000.0 | Yes |
7 | France | 48.0 | 79000.0 | Yes |
8 | Germany | 50.0 | 83000.0 | No |
9 | France | 37.0 | 67000.0 | Yes |
In [36]:
# Country is a categorical column made up of three distinct strings.
# List the distinct values in sorted order to see exactly what has to be encoded.
sorted(X["Country"].drop_duplicates())
Out[36]:
['France', 'Germany', 'Spain']
In [ ]:
# 'France'  => 0
# 'Germany' => 1
# 'Spain'   => 2
# Sort the categories and replace them, in order, with integers starting at 0 => Label Encoding
In [ ]:
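# When a model is trained on label-encoded data, the integer codes imply an
# order (France < Germany < Spain) that the categories do not actually have.
# With three or more categories, training on label-encoded values therefore
# tends to work poorly.
# Categorical data with three or more categories is better handled with
# One-Hot Encoding, which generally trains well.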
In [37]:
# Features before encoding: Country is still a string column.
X
Out[37]:
Country | Age | Salary | |
---|---|---|---|
0 | France | 44.0 | 72000.0 |
1 | Spain | 27.0 | 48000.0 |
2 | Germany | 30.0 | 54000.0 |
3 | Spain | 38.0 | 61000.0 |
5 | France | 35.0 | 58000.0 |
7 | France | 48.0 | 79000.0 |
8 | Germany | 50.0 | 83000.0 |
9 | France | 37.0 | 67000.0 |
In [ ]:
# France Germany Spain Age Salary
# 1 0 0 44 72000
# 0 0 1 27 48000
# 0 1 0
# 0 0 1
In [38]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
In [39]:
from sklearn.compose import ColumnTransformer
In [40]:
# 1. How to apply label encoding
In [42]:
# Create the encoder that maps each distinct label to an integer code.
encoder = LabelEncoder()
In [47]:
# Preview the integer codes produced for Country; nothing is assigned here.
encoder.fit_transform(X["Country"])
Out[47]:
array([0, 2, 1, 2, 0, 0, 1, 0], dtype=int64)
In [45]:
# Replace the Country strings with their integer label codes.
# DataFrame.assign returns a new frame with the column replaced in its
# original position, so nothing is written into a slice of df (a plain
# `X["Country"] = ...` on a `.loc` slice can raise SettingWithCopyWarning).
X = X.assign(Country=encoder.fit_transform(X["Country"]))
In [46]:
# Country now holds the integer codes 0 / 1 / 2 instead of strings.
X
Out[46]:
Country | Age | Salary | |
---|---|---|---|
0 | 0 | 44.0 | 72000.0 |
1 | 2 | 27.0 | 48000.0 |
2 | 1 | 30.0 | 54000.0 |
3 | 2 | 38.0 | 61000.0 |
5 | 0 | 35.0 | 58000.0 |
7 | 0 | 48.0 | 79000.0 |
8 | 1 | 50.0 | 83000.0 |
9 | 0 | 37.0 | 67000.0 |
In [48]:
# Rebuild X from df, which restores the original Country strings
# (df was never label-encoded). Copy so that X stays independent of df
# and later column assignments do not raise SettingWithCopyWarning.
X = df.loc[:, "Country":"Salary"].copy()
In [49]:
# Confirm that Country is back to its string values.
X
Out[49]:
Country | Age | Salary | |
---|---|---|---|
0 | France | 44.0 | 72000.0 |
1 | Spain | 27.0 | 48000.0 |
2 | Germany | 30.0 | 54000.0 |
3 | Spain | 38.0 | 61000.0 |
5 | France | 35.0 | 58000.0 |
7 | France | 48.0 | 79000.0 |
8 | Germany | 50.0 | 83000.0 |
9 | France | 37.0 | 67000.0 |
'DataScience > MachineLearning' 카테고리의 다른 글
Machine 예측 모델 실습, 배포를 위한 저장 (0) | 2022.12.01 |
---|---|
Machine Multiple Linear Regression (0) | 2022.12.01 |
Machine [supervised{Prediction(Linear Regression)}] (0) | 2022.12.01 |
Machine preprocessing, Feature Scaling, Dataset Training & Test (0) | 2022.12.01 |
Machine 원핫 인코딩 (One Hot Encoding) (0) | 2022.12.01 |