실습¶

In [1]:

import pandas as pd
import numpy as np


# 책 제목과 작가, 그리고 유저별 별점 데이터가 있다.

books = pd.Series(data = ['Great Expectations', 'Of Mice and Men', 'Romeo and Juliet', 'The Time Machine', 'Alice in Wonderland' ])
authors = pd.Series(data = ['Charles Dickens', 'John Steinbeck', 'William Shakespeare', ' H. G. Wells', 'Lewis Carroll' ])

user_1 = pd.Series(data = [3.2, np.nan ,2.5])
user_2 = pd.Series(data = [5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data = [2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data = [4, 3.5, 4, 5, 4.2])

#  np.nan values 는 해당 유저가 해당 책에는 아직 별점 주지 않은것이다.
# labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4'. 
# 아래 그림처럼 나오도록 만든다.

%E1%84%89%E1%85%B3%E1%84%8F%E1%85%B3%E1%84%85%E1%85%B5%E1%86%AB%E1%84%89%E1%85%A3%E1%86%BA%202019-12-27%20%E1%84%8B%E1%85%A9%E1%84%92%E1%85%AE%206.28.49.png

In [12]:

# 1. 딕셔너리를 만들고,    2. 데이터프레임으로 만든 후,    3. nan을  평균값으로 채운다.

user_1 = pd.Series(data = [3.2, np.nan ,2.5])

In [13]:

user_1

Out[13]:

0    3.2
1    NaN
2    2.5
dtype: float64

In [2]:

pd.DataFrame(data = books)

Out[2]:

	0
0	Great Expectations
1	Of Mice and Men
2	Romeo and Juliet
3	The Time Machine
4	Alice in Wonderland

In [8]:

df = pd.DataFrame(data={'Book Title': books ,'Author' : authors, 'User 1' : user_1, 'User 2':user_2, 'User 3':user_3, 'User 4':user_4})

In [9]:

df

Out[9]:

	Book Title	Author	User 1	User 2	User 3	User 4
0	Great Expectations	Charles Dickens	3.2	5.0	2.0	4.0
1	Of Mice and Men	John Steinbeck	NaN	1.3	2.3	3.5
2	Romeo and Juliet	William Shakespeare	2.5	4.0	NaN	4.0
3	The Time Machine	H. G. Wells	NaN	3.8	4.0	5.0
4	Alice in Wonderland	Lewis Carroll	NaN	NaN	NaN	4.2

In [35]:

df.fillna(df.mean(numeric_only=True))  # numeric_only=True 을쓰지않으면 문자열이 섞여있다는 warning이 뜬다.[숫자데이터만 가져오라는 의미]

Out[35]:

	Book Title	Author	User 1	User 2	User 3	User 4
0	Great Expectations	Charles Dickens	3.20	5.000	2.000000	4.0
1	Of Mice and Men	John Steinbeck	2.85	1.300	2.300000	3.5
2	Romeo and Juliet	William Shakespeare	2.50	4.000	2.766667	4.0
3	The Time Machine	H. G. Wells	2.85	3.800	4.000000	5.0
4	Alice in Wonderland	Lewis Carroll	2.85	3.525	2.766667	4.2

In [15]:

# Author의 두번째 데이터를 가져와라.
df = df.fillna(df.mean())

C:\Users\5-10\AppData\Local\Temp\ipykernel_7280\2117203791.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df = df.fillna(df.mean())

In [16]:

df

Out[16]:

	Book Title	Author	User 1	User 2	User 3	User 4
0	Great Expectations	Charles Dickens	3.20	5.000	2.000000	4.0
1	Of Mice and Men	John Steinbeck	2.85	1.300	2.300000	3.5
2	Romeo and Juliet	William Shakespeare	2.50	4.000	2.766667	4.0
3	The Time Machine	H. G. Wells	2.85	3.800	4.000000	5.0
4	Alice in Wonderland	Lewis Carroll	2.85	3.525	2.766667	4.2

In [17]:

df["Author"][1]

Out[17]:

'John Steinbeck'

In [31]:

df.iloc[ : , 1][1]

Out[31]:

'John Steinbeck'

In [33]:

df.loc[1,"Author"]

Out[33]:

'John Steinbeck'

Loading Data into a Pandas DataFrame¶

In [ ]:

# data 폴더안에 있는 GOOG.csv 파일을 읽어온다.

In [37]:

df = pd.read_csv("../data/GOOG.csv")

In [38]:

df

Out[38]:

	Date	Open	High	Low	Close	Adj Close	Volume
0	2004-08-19	49.676899	51.693783	47.669952	49.845802	49.845802	44994500
1	2004-08-20	50.178635	54.187561	49.925285	53.805050	53.805050	23005800
2	2004-08-23	55.017166	56.373344	54.172661	54.346527	54.346527	18393200
3	2004-08-24	55.260582	55.439419	51.450363	52.096165	52.096165	15361800
4	2004-08-25	52.140873	53.651051	51.604362	52.657513	52.657513	9257400
...	...	...	...	...	...	...	...
3308	2017-10-09	980.000000	985.424988	976.109985	977.000000	977.000000	891400
3309	2017-10-10	980.000000	981.570007	966.080017	972.599976	972.599976	968400
3310	2017-10-11	973.719971	990.710022	972.250000	989.250000	989.250000	1693300
3311	2017-10-12	987.450012	994.119995	985.000000	987.830017	987.830017	1262400
3312	2017-10-13	992.000000	997.210022	989.000000	989.679993	989.679993	1157700

3313 rows × 7 columns

In [39]:

df.head()

Out[39]:

	Date	Open	High	Low	Close	Adj Close	Volume
0	2004-08-19	49.676899	51.693783	47.669952	49.845802	49.845802	44994500
1	2004-08-20	50.178635	54.187561	49.925285	53.805050	53.805050	23005800
2	2004-08-23	55.017166	56.373344	54.172661	54.346527	54.346527	18393200
3	2004-08-24	55.260582	55.439419	51.450363	52.096165	52.096165	15361800
4	2004-08-25	52.140873	53.651051	51.604362	52.657513	52.657513	9257400

In [40]:

df.tail()

Out[40]:

	Date	Open	High	Low	Close	Adj Close	Volume
3308	2017-10-09	980.000000	985.424988	976.109985	977.000000	977.000000	891400
3309	2017-10-10	980.000000	981.570007	966.080017	972.599976	972.599976	968400
3310	2017-10-11	973.719971	990.710022	972.250000	989.250000	989.250000	1693300
3311	2017-10-12	987.450012	994.119995	985.000000	987.830017	987.830017	1262400
3312	2017-10-13	992.000000	997.210022	989.000000	989.679993	989.679993	1157700

In [41]:

df.shape

Out[41]:

(3313, 7)

In [43]:

df.describe()

Out[43]:

	Open	High	Low	Close	Adj Close	Volume
count	3313.000000	3313.000000	3313.000000	3313.000000	3313.000000	3.313000e+03
mean	380.186092	383.493740	376.519309	380.072458	380.072458	8.038476e+06
std	223.818650	224.974534	222.473232	223.853780	223.853780	8.399521e+06
min	49.274517	50.541279	47.669952	49.681866	49.681866	7.900000e+03
25%	226.556473	228.394516	224.003082	226.407440	226.407440	2.584900e+06
50%	293.312286	295.433502	289.929291	293.029114	293.029114	5.281300e+06
75%	536.650024	540.000000	532.409973	536.690002	536.690002	1.065370e+07
max	992.000000	997.210022	989.000000	989.679993	989.679993	8.276810e+07

In [46]:

# e+03 => 10의 3승 
# e+06 => 10의 6승
# e-04 => 10의 -4승

In [47]:

3.313 * 10**3

Out[47]:

3313.0

In [45]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3313 entries, 0 to 3312
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       3313 non-null   object 
 1   Open       3313 non-null   float64
 2   High       3313 non-null   float64
 3   Low        3313 non-null   float64
 4   Close      3313 non-null   float64
 5   Adj Close  3313 non-null   float64
 6   Volume     3313 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 181.3+ KB

In [49]:

df.isna().sum()

Out[49]:

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [50]:

# 각 컬럼별 평균을 구하세요.

In [52]:

df.mean(numeric_only=True)

Out[52]:

Open         3.801861e+02
High         3.834937e+02
Low          3.765193e+02
Close        3.800725e+02
Adj Close    3.800725e+02
Volume       8.038476e+06
dtype: float64

In [53]:

df

Out[53]:

	Date	Open	High	Low	Close	Adj Close	Volume
0	2004-08-19	49.676899	51.693783	47.669952	49.845802	49.845802	44994500
1	2004-08-20	50.178635	54.187561	49.925285	53.805050	53.805050	23005800
2	2004-08-23	55.017166	56.373344	54.172661	54.346527	54.346527	18393200
3	2004-08-24	55.260582	55.439419	51.450363	52.096165	52.096165	15361800
4	2004-08-25	52.140873	53.651051	51.604362	52.657513	52.657513	9257400
...	...	...	...	...	...	...	...
3308	2017-10-09	980.000000	985.424988	976.109985	977.000000	977.000000	891400
3309	2017-10-10	980.000000	981.570007	966.080017	972.599976	972.599976	968400
3310	2017-10-11	973.719971	990.710022	972.250000	989.250000	989.250000	1693300
3311	2017-10-12	987.450012	994.119995	985.000000	987.830017	987.830017	1262400
3312	2017-10-13	992.000000	997.210022	989.000000	989.679993	989.679993	1157700

3313 rows × 7 columns

In [54]:

# 상한가(High)의 평균을 구하세요

In [55]:

df["High"].meann()

Out[55]:

383.4937395517656

In [56]:

# 종가(Close)의 최대값은?

In [57]:

df["Close"].max()

Out[57]:

989.679993

In [ ]:

In [64]:

df = pd.read_csv("../data/fake_company.csv")

In [65]:

df

Out[65]:

	Year	Name	Department	Age	Salary
0	1990	Alice	HR	25	50000
1	1990	Bob	RD	30	48000
2	1990	Charlie	Admin	45	55000
3	1991	Alice	HR	26	52000
4	1991	Bob	RD	31	50000
5	1991	Charlie	Admin	46	60000
6	1992	Alice	HR	27	60000
7	1992	Bob	RD	32	52000
8	1992	Charlie	Admin	47	62000

In [72]:

# 카테고리컬 데이터(Categorical Data) 분류가 가능한, 즉 중복이 많은것.

In [73]:

df.shape

Out[73]:

(9, 5)

In [71]:

df["Year"].unique()

Out[71]:

array([1990, 1991, 1992], dtype=int64)

In [74]:

df["Year"].nunique()

Out[74]:

In [76]:

df["Department"].unique()

Out[76]:

array(['HR', 'RD', 'Admin'], dtype=object)

In [77]:

df["Department"].nunique()

Out[77]:

In [ ]:

In [78]:

df

Out[78]:

	Year	Name	Department	Age	Salary
0	1990	Alice	HR	25	50000
1	1990	Bob	RD	30	48000
2	1990	Charlie	Admin	45	55000
3	1991	Alice	HR	26	52000
4	1991	Bob	RD	31	50000
5	1991	Charlie	Admin	46	60000
6	1992	Alice	HR	27	60000
7	1992	Bob	RD	32	52000
8	1992	Charlie	Admin	47	62000

In [79]:

df.head(2)

Out[79]:

	Year	Name	Department	Age	Salary
0	1990	Alice	HR	25	50000
1	1990	Bob	RD	30	48000

In [66]:

# 데이터프레임 전체의 통계치를 확인 => 숫자 데이터만
df.describe()

Out[66]:

	Year	Age	Salary
count	9.000000	9.000000	9.000000
mean	1991.000000	34.333333	54333.333333
std	0.866025	9.055385	5147.815070
min	1990.000000	25.000000	48000.000000
25%	1990.000000	27.000000	50000.000000
50%	1991.000000	31.000000	52000.000000
75%	1992.000000	45.000000	60000.000000
max	1992.000000	47.000000	62000.000000

In [67]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        9 non-null      int64 
 1   Name        9 non-null      object
 2   Department  9 non-null      object
 3   Age         9 non-null      int64 
 4   Salary      9 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 488.0+ bytes

In [80]:

# 문자열 컬럼에 describe() 함수 사용 
# Name 컬럼만 해보자

In [81]:

df["Name"].describe()  # top : 제일많이겹치는 단어, freq : 겹치는개수

Out[81]:

count         9
unique        3
top       Alice
freq          3
Name: Name, dtype: object

In [82]:

df["Department"].describe()

Out[82]:

count      9
unique     3
top       HR
freq       3
Name: Department, dtype: object

In [ ]:

# 카테고리컬 데이터의 경우엔,
# 데이터 분석시,
# 데이터별로 묶어서 데이터를 분석할 수 있다.

In [83]:

# 각 년도별로, 지급한 연봉 총 합을 구하라.

In [90]:

df.groupby("Year")["Salary"].sum()

Out[90]:

Year
1990    153000
1991    162000
1992    174000
Name: Salary, dtype: int64

In [91]:

# 각 직원별로, 얼마씩 받았는지 연봉 평균을 구하세요.

In [92]:

df.groupby("Name")["Salary"].mean()

Out[92]:

Name
Alice      54000.0
Bob        50000.0
Charlie    59000.0
Name: Salary, dtype: float64

In [93]:

# 년도별로, 연봉의 총합, 평균, 표준편차를 보여주세요

In [96]:

import numpy as np

In [97]:

df.groupby("Year")["Salary"].agg( [ np.sum , np.mean , np.std ])

Out[97]:

	sum	mean	std
Year
1990	153000	51000.0	3605.551275
1991	162000	54000.0	5291.502622
1992	174000	58000.0	5291.502622

In [98]:

# Name 컬럼은, 각 이름별로 몇개의 데이터가 있나?

In [102]:

df.groupby("Name")["Name"].count()

Out[102]:

Name
Alice      3
Bob        3
Charlie    3
Name: Name, dtype: int64

In [ ]:

# Name 컬럼은, 각 이름별로 몇개의 데이터가 있나?

In [105]:

df["Name"].value_counts()    # 많이쓰는 함수

Out[105]:

Alice      3
Bob        3
Charlie    3
Name: Name, dtype: int64

PANDAS OPERATIONS¶

In [106]:

df = pd.DataFrame({'Employee ID':[111, 222, 333, 444],
                   'Employee Name':['Chanel', 'Steve', 'Mitch', 'Bird'],
                   'Salary [$/h]':[35, 29, 38, 20],
                   'Years of Experience':[3, 4 ,9, 1]})
df

Out[106]:

	Employee ID	Employee Name	Salary [$/h]	Years of Experience
0	111	Chanel	35	3
1	222	Steve	29	4
2	333	Mitch	38	9
3	444	Bird	20	1

In [108]:

# 경력이 3년 이상인 사람의 데이터만 가져오시오

In [112]:

df["Years of Experience"]

Out[112]:

0    3
1    4
2    9
3    1
Name: Years of Experience, dtype: int64

In [113]:

df["Years of Experience"] >= 3

Out[113]:

0     True
1     True
2     True
3    False
Name: Years of Experience, dtype: bool

In [115]:

df.loc[ df["Years of Experience"] >= 3 , ] ## 중요

Out[115]:

	Employee ID	Employee Name	Salary [$/h]	Years of Experience
0	111	Chanel	35	3
1	222	Steve	29	4
2	333	Mitch	38	9

In [117]:

df.iloc[ df["Years of Experience"] >= 3 , ] ## 에러

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
Cell In [117], line 1
----> 1 df.iloc[ df["Years of Experience"] >= 3 , ]

File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:961, in _LocationIndexer.__getitem__(self, key)
    959     if self._is_scalar_access(key):
    960         return self.obj._get_value(*key, takeable=self._takeable)
--> 961     return self._getitem_tuple(key)
    962 else:
    963     # we by definition only have the 0th axis
    964     axis = self.axis or 0

File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:1461, in _iLocIndexer._getitem_tuple(self, tup)
   1459 def _getitem_tuple(self, tup: tuple):
-> 1461     tup = self._validate_tuple_indexer(tup)
   1462     with suppress(IndexingError):
   1463         return self._getitem_lowerdim(tup)

File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:769, in _LocationIndexer._validate_tuple_indexer(self, key)
    767 for i, k in enumerate(key):
    768     try:
--> 769         self._validate_key(k, i)
    770     except ValueError as err:
    771         raise ValueError(
    772             "Location based indexing can only have "
    773             f"[{self._valid_types}] types"
    774         ) from err

File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:1350, in _iLocIndexer._validate_key(self, key, axis)
   1348 if hasattr(key, "index") and isinstance(key.index, Index):
   1349     if key.index.inferred_type == "integer":
-> 1350         raise NotImplementedError(
   1351             "iLocation based boolean "
   1352             "indexing on an integer type "
   1353             "is not available"
   1354         )
   1355     raise ValueError(
   1356         "iLocation based boolean indexing cannot use "
   1357         "an indexable as a mask"
   1358     )
   1359 return

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [118]:

df

Out[118]:

	Employee ID	Employee Name	Salary [$/h]	Years of Experience
0	111	Chanel	35	3
1	222	Steve	29	4
2	333	Mitch	38	9
3	444	Bird	20	1

In [ ]:

# 경력이 3년이상인 사람의 이름과 시급정보를 가져오시오

In [119]:

df.loc[ df["Years of Experience"] >= 3 , ["Employee Name", 'Salary [$/h]' ]]

Out[119]:

	Employee Name	Salary [$/h]
0	Chanel	35
1	Steve	29
2	Mitch	38

In [ ]:

In [120]:

# 경력이 4년이상 8년 이하인 사람의 데이터를 가져오시오

In [128]:

df["Years of Experience"] >= 4

Out[128]:

0    False
1     True
2     True
3    False
Name: Years of Experience, dtype: bool

In [129]:

df["Years of Experience"] <= 8

Out[129]:

0     True
1     True
2    False
3     True
Name: Years of Experience, dtype: bool

In [133]:

df.loc[ (df["Years of Experience"] >= 4) & (df["Years of Experience"] <= 8), "Employee Name": ]

Out[133]:

	Employee Name	Salary [$/h]	Years of Experience
1	Steve	29	4

In [134]:

# 시급이 가장 높은 사람의 데이터를 가져오시오

In [135]:

df

Out[135]:

	Employee ID	Employee Name	Salary [$/h]	Years of Experience
0	111	Chanel	35	3
1	222	Steve	29	4
2	333	Mitch	38	9
3	444	Bird	20	1

In [140]:

df["Salary [$/h]"].max() 

Out[140]:

In [141]:

df["Salary [$/h]"] == df["Salary [$/h]"].max() 

Out[141]:

0    False
1    False
2     True
3    False
Name: Salary [$/h], dtype: bool

In [142]:

df.loc[ df["Salary [$/h]"] == df["Salary [$/h]"].max()  , ]

Out[142]:

	Employee ID	Employee Name	Salary [$/h]	Years of Experience
2	333	Mitch	38	9

Pandas 데이터프레임 오름차순, 내림차순 정렬 .Sort_values() ,sort_index() (0)	2022.11.25
Pandas 사용자 정의 함수사용 .apply(), 판다스내장.str라이브러리 (0)	2022.11.25
Pandas NaN을 처리하는 전략 dropna(), fillna() (0)	2022.11.24
Pandas CSV파일불러오기, .describe() 통계, .info()정보 (0)	2022.11.24
Pandas 행, 열 추가, 데이터 삭제 drop(), rename(), 인덱스 초기화 reset_index(inplace= True) (0)	2022.11.24

블로그안함

Pandas 카테고리컬, groupby(), 특정 데이터 가져오기

실습¶

Loading Data into a Pandas DataFrame¶

PANDAS OPERATIONS¶

'DataScience > Pandas' 카테고리의 다른 글

티스토리툴바

Pandas 카테고리컬, groupby(), 특정 데이터 가져오기

실습¶

Loading Data into a Pandas DataFrame¶

PANDAS OPERATIONS¶

'DataScience > Pandas' 카테고리의 다른 글

관련글

티스토리툴바