09.a 강의 pandas

Dealing with NaN¶

In [150]:

# Sales records for three stores, one dict per store.
# Not every store sells every item, so some keys are absent from a record.
# Key order is kept as-is because it decides the resulting column order.
items2 = [
    dict(bikes=20, pants=30, watches=35, shirts=15, shoes=8, suits=45),
    dict(watches=10, glasses=50, bikes=15, pants=5, shirts=2, shoes=5, suits=7),
    dict(bikes=20, pants=30, watches=35, glasses=4, shoes=10),
]

In [152]:

# One row per dict; a key that a record lacks shows up as NaN in that row.
pd.DataFrame(items2)

Out[152]:

	bikes	pants	watches	shirts	shoes	suits	glasses
0	20	30	35	15.0	8	45.0	NaN
1	15	5	10	2.0	5	7.0	50.0
2	20	30	35	NaN	10	NaN	4.0

In [151]:

# Same frame, but with store names as row labels instead of 0, 1, 2.
pd.DataFrame(
    items2,
    index=["store 1", "store 2", "store 3"],
)

Out[151]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [153]:

# Keep the store-labelled frame in `df` for the rest of the walkthrough.
df = pd.DataFrame(
    items2,
    index=["store 1", "store 2", "store 3"],
)

In [154]:

# Display the frame that was just stored in `df`.
df

Out[154]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [155]:

# First find out where the missing data is and how many values are missing.

In [156]:

# Boolean mask with the same shape as df: True where the value is missing (NaN).
df.isna()  

Out[156]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	False	False	False	False	False	False	True
store 2	False	False	False	False	False	False	False
store 3	False	False	False	True	False	True	False

In [157]:

df.isnull()  # does the same thing as isna(), but isna() is the newer name

Out[157]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	False	False	False	False	False	False	True
store 2	False	False	False	False	False	False	False
store 3	False	False	False	True	False	True	False

In [158]:

df.isna().sum()    # DataFrame.sum() over the boolean mask: number of missing values per column

Out[158]:

bikes      0
pants      0
watches    0
shirts     1
shoes      0
suits      1
glasses    1
dtype: int64

In [160]:

# Sum the per-column counts to get the total number of missing values in the frame.
df.isna().sum().sum()

Out[160]:

3

In [ ]:

In [161]:

# Strategies for handling NaN

In [162]:

# 1. The strategy of deleting the missing data

In [165]:

# Display df to see which cells hold NaN.
df

Out[165]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [164]:

df.dropna()   # drop, row by row, every row that contains a NaN

Out[164]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 2	15	5	10	2.0	5	7.0	50.0

In [ ]:

# 2. The strategy of filling with a specific value.

In [167]:

# Replace every NaN with 0; the result is displayed, df itself is not reassigned.
df.fillna(0)

Out[167]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	0.0
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	0.0	10	0.0	4.0

In [168]:

# Fill with a placeholder string instead of a number (the literal means "no data").
df.fillna("데이터없음")

Out[168]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	데이터없음
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	데이터없음	10	데이터없음	4.0

In [169]:

# Any other constant works the same way: every NaN becomes 100.
df.fillna(100)

Out[169]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	100.0
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	100.0	10	100.0	4.0

In [170]:

# The fillna() results above were only displayed, never assigned, so df still contains its NaN values.
df

Out[170]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [ ]:

# Fill the missing part of the shirts data with 0.

In [178]:

# Only the shirts column: replace its NaN with 0 and write the column back into df.
df["shirts"] = df["shirts"].fillna(value=0)

In [179]:

# shirts no longer has a NaN; suits and glasses still do.
df

Out[179]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	0.0	10	NaN	4.0

In [172]:

# Fill the missing data of suits and glasses with 100

In [183]:

# Fill NaN in just the suits and glasses columns with 100 and write both columns back into df.
df[["suits", "glasses"]] = df[["suits", "glasses"]].fillna(value=100)

In [184]:

# After the two column-wise fills, df has no NaN left.
df

Out[184]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	100.0
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	0.0	10	100.0	4.0

In [ ]:

In [185]:

# Rebuild df from the raw records so the NaN values are back for the next examples.
df = pd.DataFrame(
    items2,
    index=["store 1", "store 2", "store 3"],
)

In [186]:

# Display the rebuilt frame; the NaN values are present again.
df

Out[186]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [187]:

# Ways to fill the missing data:
# with the value from the row above or the row below,
# or with the value from the column to the left or the column to the right.

In [188]:

# Fill each NaN with the value from the row above (forward fill down the rows).
# A NaN in the first row stays NaN because there is no row above it.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(axis=0)

Out[188]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	2.0	10	7.0	4.0

In [189]:

# Fill each NaN with the value from the row below (backward fill up the rows).
# A NaN in the last row stays NaN because there is no row below it.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill(axis=0)

Out[189]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	50.0
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [192]:

# Fill each NaN with the value from the column to its left (forward fill along each row).
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(axis=1)

Out[192]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20.0	30.0	35.0	15.0	8.0	45.0	45.0
store 2	15.0	5.0	10.0	2.0	5.0	7.0	50.0
store 3	20.0	30.0	35.0	35.0	10.0	10.0	4.0

In [193]:

# Fill each NaN with the value from the column to its right (backward fill along each row).
# A NaN in the last column stays NaN because there is no column to its right.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill(axis=1)

Out[193]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20.0	30.0	35.0	15.0	8.0	45.0	NaN
store 2	15.0	5.0	10.0	2.0	5.0	7.0	50.0
store 3	20.0	30.0	35.0	10.0	10.0	4.0	4.0

In [202]:

# A pandas DataFrame conveniently computes the statistic
# for each column on its own.
# NaN values are skipped: shirts is averaged over its two present values.
df.mean()

Out[202]:

bikes      18.333333
pants      21.666667
watches    26.666667
shirts      8.500000
shoes       7.666667
suits      26.000000
glasses    27.000000
dtype: float64

In [203]:

# Largest value in each column (NaN values are skipped).
df.max()

Out[203]:

bikes      20.0
pants      30.0
watches    35.0
shirts     15.0
shoes      10.0
suits      45.0
glasses    50.0
dtype: float64

In [204]:

# Smallest value in each column (NaN values are skipped).
df.min()

Out[204]:

bikes      15.0
pants       5.0
watches    10.0
shirts      2.0
shoes       5.0
suits       7.0
glasses     4.0
dtype: float64

In [205]:

# Standard deviation of each column; pandas uses the sample form (ddof=1) by default.
df.std()

Out[205]:

bikes       2.886751
pants      14.433757
watches    14.433757
shirts      9.192388
shoes       2.516611
suits      26.870058
glasses    32.526912
dtype: float64

In [206]:

# Median of each column (NaN values are skipped).
df.median()

Out[206]:

bikes      20.0
pants      30.0
watches    35.0
shirts      8.5
shoes       8.0
suits      26.0
glasses    27.0
dtype: float64

In [207]:

# Passing the Series of column means fills each column's NaN with that column's own mean.
df.fillna(df.mean())

Out[207]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	27.0
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	8.5	10	26.0	4.0

In [196]:

# Summary statistics per column; `count` only counts the non-missing values.
df.describe()

Out[196]:

	bikes	pants	watches	shirts	shoes	suits	glasses
count	3.000000	3.000000	3.000000	2.000000	3.000000	2.000000	2.000000
mean	18.333333	21.666667	26.666667	8.500000	7.666667	26.000000	27.000000
std	2.886751	14.433757	14.433757	9.192388	2.516611	26.870058	32.526912
min	15.000000	5.000000	10.000000	2.000000	5.000000	7.000000	4.000000
25%	17.500000	17.500000	22.500000	5.250000	6.500000	16.500000	15.500000
50%	20.000000	30.000000	35.000000	8.500000	8.000000	26.000000	27.000000
75%	20.000000	30.000000	35.000000	11.750000	9.000000	35.500000	38.500000
max	20.000000	30.000000	35.000000	15.000000	10.000000	45.000000	50.000000

In [208]:

# df itself is unchanged: the mean-filled frame above was displayed but not assigned.
df

Out[208]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	20	30	35	15.0	8	45.0	NaN
store 2	15	5	10	2.0	5	7.0	50.0
store 3	20	30	35	NaN	10	NaN	4.0

In [210]:

# The opposite of isna()

In [211]:

# For comparison: True marks a missing value.
df.isna()

Out[211]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	False	False	False	False	False	False	True
store 2	False	False	False	False	False	False	False
store 3	False	False	False	True	False	True	False

In [209]:

# Inverse mask: True marks a value that is present, False marks a NaN.
df.notna()

Out[209]:

	bikes	pants	watches	shirts	shoes	suits	glasses
store 1	True	True	True	True	True	True	False
store 2	True	True	True	True	True	True	True
store 3	True	True	True	False	True	False	True

'DataScience > Pandas' 카테고리의 다른 글

Pandas 사용자 정의 함수사용 .apply(), 판다스내장.str라이브러리 (0)	2022.11.25
Pandas 카테고리컬, groupby(), 특정 데이터 가져오기 (0)	2022.11.24
Pandas CSV파일불러오기, .describe() 통계, .info()정보 (0)	2022.11.24
Pandas 행, 열 추가, 데이터 삭제 drop(), rename(), 인덱스 초기화 reset_index(inplace= True) (0)	2022.11.24
Pandas .iloc[ , ], 데이터 프레임에서 컬럼 만드는 방법 (0)	2022.11.24

Gemini & Ocean

Pandas NaN을 처리하는 전략 dropna(), fillna()

Dealing with NaN¶

'DataScience > Pandas' 카테고리의 다른 글

티스토리툴바

Pandas NaN을 처리하는 전략 dropna(), fillna()

Dealing with NaN¶

'DataScience > Pandas' 카테고리의 다른 글

관련글

티스토리툴바