실습¶
In [1]:
import pandas as pd
import numpy as np
# 책 제목과 작가, 그리고 유저별 별점 데이터가 있다.
books = pd.Series(data = ['Great Expectations', 'Of Mice and Men', 'Romeo and Juliet', 'The Time Machine', 'Alice in Wonderland' ])
authors = pd.Series(data = ['Charles Dickens', 'John Steinbeck', 'William Shakespeare', ' H. G. Wells', 'Lewis Carroll' ])
user_1 = pd.Series(data = [3.2, np.nan ,2.5])
user_2 = pd.Series(data = [5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data = [2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data = [4, 3.5, 4, 5, 4.2])
# np.nan values 는 해당 유저가 해당 책에는 아직 별점 주지 않은것이다.
# labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4'.
# 아래 그림처럼 나오도록 만든다.
In [12]:
# 1. 딕셔너리를 만들고, 2. 데이터프레임으로 만든 후, 3. nan을 평균값으로 채운다.
user_1 = pd.Series(data = [3.2, np.nan ,2.5])
In [13]:
user_1
Out[13]:
0 3.2 1 NaN 2 2.5 dtype: float64
In [2]:
pd.DataFrame(data = books)
Out[2]:
0 | |
---|---|
0 | Great Expectations |
1 | Of Mice and Men |
2 | Romeo and Juliet |
3 | The Time Machine |
4 | Alice in Wonderland |
In [8]:
df = pd.DataFrame(data={'Book Title': books ,'Author' : authors, 'User 1' : user_1, 'User 2':user_2, 'User 3':user_3, 'User 4':user_4})
In [9]:
df
Out[9]:
Book Title | Author | User 1 | User 2 | User 3 | User 4 | |
---|---|---|---|---|---|---|
0 | Great Expectations | Charles Dickens | 3.2 | 5.0 | 2.0 | 4.0 |
1 | Of Mice and Men | John Steinbeck | NaN | 1.3 | 2.3 | 3.5 |
2 | Romeo and Juliet | William Shakespeare | 2.5 | 4.0 | NaN | 4.0 |
3 | The Time Machine | H. G. Wells | NaN | 3.8 | 4.0 | 5.0 |
4 | Alice in Wonderland | Lewis Carroll | NaN | NaN | NaN | 4.2 |
In [35]:
df.fillna(df.mean(numeric_only=True)) # numeric_only=True 을쓰지않으면 문자열이 섞여있다는 warning이 뜬다.[숫자데이터만 가져오라는 의미]
Out[35]:
Book Title | Author | User 1 | User 2 | User 3 | User 4 | |
---|---|---|---|---|---|---|
0 | Great Expectations | Charles Dickens | 3.20 | 5.000 | 2.000000 | 4.0 |
1 | Of Mice and Men | John Steinbeck | 2.85 | 1.300 | 2.300000 | 3.5 |
2 | Romeo and Juliet | William Shakespeare | 2.50 | 4.000 | 2.766667 | 4.0 |
3 | The Time Machine | H. G. Wells | 2.85 | 3.800 | 4.000000 | 5.0 |
4 | Alice in Wonderland | Lewis Carroll | 2.85 | 3.525 | 2.766667 | 4.2 |
In [15]:
# Author의 두번째 데이터를 가져와라.
df = df.fillna(df.mean())
C:\Users\5-10\AppData\Local\Temp\ipykernel_7280\2117203791.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. df = df.fillna(df.mean())
In [16]:
df
Out[16]:
Book Title | Author | User 1 | User 2 | User 3 | User 4 | |
---|---|---|---|---|---|---|
0 | Great Expectations | Charles Dickens | 3.20 | 5.000 | 2.000000 | 4.0 |
1 | Of Mice and Men | John Steinbeck | 2.85 | 1.300 | 2.300000 | 3.5 |
2 | Romeo and Juliet | William Shakespeare | 2.50 | 4.000 | 2.766667 | 4.0 |
3 | The Time Machine | H. G. Wells | 2.85 | 3.800 | 4.000000 | 5.0 |
4 | Alice in Wonderland | Lewis Carroll | 2.85 | 3.525 | 2.766667 | 4.2 |
In [17]:
df["Author"][1]
Out[17]:
'John Steinbeck'
In [31]:
df.iloc[ : , 1][1]
Out[31]:
'John Steinbeck'
In [33]:
df.loc[1,"Author"]
Out[33]:
'John Steinbeck'
Loading Data into a Pandas DataFrame¶
In [ ]:
# data 폴더안에 있는 GOOG.csv 파일을 읽어온다.
In [37]:
df = pd.read_csv("../data/GOOG.csv")
In [38]:
df
Out[38]:
Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
0 | 2004-08-19 | 49.676899 | 51.693783 | 47.669952 | 49.845802 | 49.845802 | 44994500 |
1 | 2004-08-20 | 50.178635 | 54.187561 | 49.925285 | 53.805050 | 53.805050 | 23005800 |
2 | 2004-08-23 | 55.017166 | 56.373344 | 54.172661 | 54.346527 | 54.346527 | 18393200 |
3 | 2004-08-24 | 55.260582 | 55.439419 | 51.450363 | 52.096165 | 52.096165 | 15361800 |
4 | 2004-08-25 | 52.140873 | 53.651051 | 51.604362 | 52.657513 | 52.657513 | 9257400 |
... | ... | ... | ... | ... | ... | ... | ... |
3308 | 2017-10-09 | 980.000000 | 985.424988 | 976.109985 | 977.000000 | 977.000000 | 891400 |
3309 | 2017-10-10 | 980.000000 | 981.570007 | 966.080017 | 972.599976 | 972.599976 | 968400 |
3310 | 2017-10-11 | 973.719971 | 990.710022 | 972.250000 | 989.250000 | 989.250000 | 1693300 |
3311 | 2017-10-12 | 987.450012 | 994.119995 | 985.000000 | 987.830017 | 987.830017 | 1262400 |
3312 | 2017-10-13 | 992.000000 | 997.210022 | 989.000000 | 989.679993 | 989.679993 | 1157700 |
3313 rows × 7 columns
In [39]:
df.head()
Out[39]:
Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
0 | 2004-08-19 | 49.676899 | 51.693783 | 47.669952 | 49.845802 | 49.845802 | 44994500 |
1 | 2004-08-20 | 50.178635 | 54.187561 | 49.925285 | 53.805050 | 53.805050 | 23005800 |
2 | 2004-08-23 | 55.017166 | 56.373344 | 54.172661 | 54.346527 | 54.346527 | 18393200 |
3 | 2004-08-24 | 55.260582 | 55.439419 | 51.450363 | 52.096165 | 52.096165 | 15361800 |
4 | 2004-08-25 | 52.140873 | 53.651051 | 51.604362 | 52.657513 | 52.657513 | 9257400 |
In [40]:
df.tail()
Out[40]:
Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
3308 | 2017-10-09 | 980.000000 | 985.424988 | 976.109985 | 977.000000 | 977.000000 | 891400 |
3309 | 2017-10-10 | 980.000000 | 981.570007 | 966.080017 | 972.599976 | 972.599976 | 968400 |
3310 | 2017-10-11 | 973.719971 | 990.710022 | 972.250000 | 989.250000 | 989.250000 | 1693300 |
3311 | 2017-10-12 | 987.450012 | 994.119995 | 985.000000 | 987.830017 | 987.830017 | 1262400 |
3312 | 2017-10-13 | 992.000000 | 997.210022 | 989.000000 | 989.679993 | 989.679993 | 1157700 |
In [41]:
df.shape
Out[41]:
(3313, 7)
In [43]:
df.describe()
Out[43]:
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
count | 3313.000000 | 3313.000000 | 3313.000000 | 3313.000000 | 3313.000000 | 3.313000e+03 |
mean | 380.186092 | 383.493740 | 376.519309 | 380.072458 | 380.072458 | 8.038476e+06 |
std | 223.818650 | 224.974534 | 222.473232 | 223.853780 | 223.853780 | 8.399521e+06 |
min | 49.274517 | 50.541279 | 47.669952 | 49.681866 | 49.681866 | 7.900000e+03 |
25% | 226.556473 | 228.394516 | 224.003082 | 226.407440 | 226.407440 | 2.584900e+06 |
50% | 293.312286 | 295.433502 | 289.929291 | 293.029114 | 293.029114 | 5.281300e+06 |
75% | 536.650024 | 540.000000 | 532.409973 | 536.690002 | 536.690002 | 1.065370e+07 |
max | 992.000000 | 997.210022 | 989.000000 | 989.679993 | 989.679993 | 8.276810e+07 |
In [46]:
# e+03 => 10의 3승
# e+06 => 10의 6승
# e-04 => 10의 -4승
In [47]:
3.313 * 10**3
Out[47]:
3313.0
In [45]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3313 entries, 0 to 3312 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 3313 non-null object 1 Open 3313 non-null float64 2 High 3313 non-null float64 3 Low 3313 non-null float64 4 Close 3313 non-null float64 5 Adj Close 3313 non-null float64 6 Volume 3313 non-null int64 dtypes: float64(5), int64(1), object(1) memory usage: 181.3+ KB
In [49]:
df.isna().sum()
Out[49]:
Date 0 Open 0 High 0 Low 0 Close 0 Adj Close 0 Volume 0 dtype: int64
In [50]:
# 각 컬럼별 평균을 구하세요.
In [52]:
df.mean(numeric_only=True)
Out[52]:
Open 3.801861e+02 High 3.834937e+02 Low 3.765193e+02 Close 3.800725e+02 Adj Close 3.800725e+02 Volume 8.038476e+06 dtype: float64
In [53]:
df
Out[53]:
Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
0 | 2004-08-19 | 49.676899 | 51.693783 | 47.669952 | 49.845802 | 49.845802 | 44994500 |
1 | 2004-08-20 | 50.178635 | 54.187561 | 49.925285 | 53.805050 | 53.805050 | 23005800 |
2 | 2004-08-23 | 55.017166 | 56.373344 | 54.172661 | 54.346527 | 54.346527 | 18393200 |
3 | 2004-08-24 | 55.260582 | 55.439419 | 51.450363 | 52.096165 | 52.096165 | 15361800 |
4 | 2004-08-25 | 52.140873 | 53.651051 | 51.604362 | 52.657513 | 52.657513 | 9257400 |
... | ... | ... | ... | ... | ... | ... | ... |
3308 | 2017-10-09 | 980.000000 | 985.424988 | 976.109985 | 977.000000 | 977.000000 | 891400 |
3309 | 2017-10-10 | 980.000000 | 981.570007 | 966.080017 | 972.599976 | 972.599976 | 968400 |
3310 | 2017-10-11 | 973.719971 | 990.710022 | 972.250000 | 989.250000 | 989.250000 | 1693300 |
3311 | 2017-10-12 | 987.450012 | 994.119995 | 985.000000 | 987.830017 | 987.830017 | 1262400 |
3312 | 2017-10-13 | 992.000000 | 997.210022 | 989.000000 | 989.679993 | 989.679993 | 1157700 |
3313 rows × 7 columns
In [54]:
# 상한가(High)의 평균을 구하세요
In [55]:
df["High"].meann()
Out[55]:
383.4937395517656
In [56]:
# 종가(Close)의 최대값은?
In [57]:
df["Close"].max()
Out[57]:
989.679993
In [ ]:
In [64]:
df = pd.read_csv("../data/fake_company.csv")
In [65]:
df
Out[65]:
Year | Name | Department | Age | Salary | |
---|---|---|---|---|---|
0 | 1990 | Alice | HR | 25 | 50000 |
1 | 1990 | Bob | RD | 30 | 48000 |
2 | 1990 | Charlie | Admin | 45 | 55000 |
3 | 1991 | Alice | HR | 26 | 52000 |
4 | 1991 | Bob | RD | 31 | 50000 |
5 | 1991 | Charlie | Admin | 46 | 60000 |
6 | 1992 | Alice | HR | 27 | 60000 |
7 | 1992 | Bob | RD | 32 | 52000 |
8 | 1992 | Charlie | Admin | 47 | 62000 |
In [72]:
# 카테고리컬 데이터(Categorical Data) 분류가 가능한, 즉 중복이 많은것.
In [73]:
df.shape
Out[73]:
(9, 5)
In [71]:
df["Year"].unique()
Out[71]:
array([1990, 1991, 1992], dtype=int64)
In [74]:
df["Year"].nunique()
Out[74]:
3
In [76]:
df["Department"].unique()
Out[76]:
array(['HR', 'RD', 'Admin'], dtype=object)
In [77]:
df["Department"].nunique()
Out[77]:
3
In [ ]:
In [78]:
df
Out[78]:
Year | Name | Department | Age | Salary | |
---|---|---|---|---|---|
0 | 1990 | Alice | HR | 25 | 50000 |
1 | 1990 | Bob | RD | 30 | 48000 |
2 | 1990 | Charlie | Admin | 45 | 55000 |
3 | 1991 | Alice | HR | 26 | 52000 |
4 | 1991 | Bob | RD | 31 | 50000 |
5 | 1991 | Charlie | Admin | 46 | 60000 |
6 | 1992 | Alice | HR | 27 | 60000 |
7 | 1992 | Bob | RD | 32 | 52000 |
8 | 1992 | Charlie | Admin | 47 | 62000 |
In [79]:
df.head(2)
Out[79]:
Year | Name | Department | Age | Salary | |
---|---|---|---|---|---|
0 | 1990 | Alice | HR | 25 | 50000 |
1 | 1990 | Bob | RD | 30 | 48000 |
In [66]:
# 데이터프레임 전체의 통계치를 확인 => 숫자 데이터만
df.describe()
Out[66]:
Year | Age | Salary | |
---|---|---|---|
count | 9.000000 | 9.000000 | 9.000000 |
mean | 1991.000000 | 34.333333 | 54333.333333 |
std | 0.866025 | 9.055385 | 5147.815070 |
min | 1990.000000 | 25.000000 | 48000.000000 |
25% | 1990.000000 | 27.000000 | 50000.000000 |
50% | 1991.000000 | 31.000000 | 52000.000000 |
75% | 1992.000000 | 45.000000 | 60000.000000 |
max | 1992.000000 | 47.000000 | 62000.000000 |
In [67]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9 entries, 0 to 8 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 9 non-null int64 1 Name 9 non-null object 2 Department 9 non-null object 3 Age 9 non-null int64 4 Salary 9 non-null int64 dtypes: int64(3), object(2) memory usage: 488.0+ bytes
In [80]:
# 문자열 컬럼에 describe() 함수 사용
# Name 컬럼만 해보자
In [81]:
df["Name"].describe() # top : 제일많이겹치는 단어, freq : 겹치는개수
Out[81]:
count 9 unique 3 top Alice freq 3 Name: Name, dtype: object
In [82]:
df["Department"].describe()
Out[82]:
count 9 unique 3 top HR freq 3 Name: Department, dtype: object
In [ ]:
# 카테고리컬 데이터의 경우엔,
# 데이터 분석시,
# 데이터별로 묶어서 데이터를 분석할 수 있다.
In [83]:
# 각 년도별로, 지급한 연봉 총 합을 구하라.
In [90]:
df.groupby("Year")["Salary"].sum()
Out[90]:
Year 1990 153000 1991 162000 1992 174000 Name: Salary, dtype: int64
In [91]:
# 각 직원별로, 얼마씩 받았는지 연봉 평균을 구하세요.
In [92]:
df.groupby("Name")["Salary"].mean()
Out[92]:
Name Alice 54000.0 Bob 50000.0 Charlie 59000.0 Name: Salary, dtype: float64
In [93]:
# 년도별로, 연봉의 총합, 평균, 표준편차를 보여주세요
In [96]:
import numpy as np
In [97]:
df.groupby("Year")["Salary"].agg( [ np.sum , np.mean , np.std ])
Out[97]:
sum | mean | std | |
---|---|---|---|
Year | |||
1990 | 153000 | 51000.0 | 3605.551275 |
1991 | 162000 | 54000.0 | 5291.502622 |
1992 | 174000 | 58000.0 | 5291.502622 |
In [98]:
# Name 컬럼은, 각 이름별로 몇개의 데이터가 있나?
In [102]:
df.groupby("Name")["Name"].count()
Out[102]:
Name Alice 3 Bob 3 Charlie 3 Name: Name, dtype: int64
In [ ]:
# Name 컬럼은, 각 이름별로 몇개의 데이터가 있나?
In [105]:
df["Name"].value_counts() # 많이쓰는 함수
Out[105]:
Alice 3 Bob 3 Charlie 3 Name: Name, dtype: int64
PANDAS OPERATIONS¶
In [106]:
df = pd.DataFrame({'Employee ID':[111, 222, 333, 444],
'Employee Name':['Chanel', 'Steve', 'Mitch', 'Bird'],
'Salary [$/h]':[35, 29, 38, 20],
'Years of Experience':[3, 4 ,9, 1]})
df
Out[106]:
Employee ID | Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|---|
0 | 111 | Chanel | 35 | 3 |
1 | 222 | Steve | 29 | 4 |
2 | 333 | Mitch | 38 | 9 |
3 | 444 | Bird | 20 | 1 |
In [108]:
# 경력이 3년 이상인 사람의 데이터만 가져오시오
In [112]:
df["Years of Experience"]
Out[112]:
0 3 1 4 2 9 3 1 Name: Years of Experience, dtype: int64
In [113]:
df["Years of Experience"] >= 3
Out[113]:
0 True 1 True 2 True 3 False Name: Years of Experience, dtype: bool
In [115]:
df.loc[ df["Years of Experience"] >= 3 , ] ## 중요
Out[115]:
Employee ID | Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|---|
0 | 111 | Chanel | 35 | 3 |
1 | 222 | Steve | 29 | 4 |
2 | 333 | Mitch | 38 | 9 |
In [117]:
df.iloc[ df["Years of Experience"] >= 3 , ] ## 에러
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) Cell In [117], line 1 ----> 1 df.iloc[ df["Years of Experience"] >= 3 , ] File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:961, in _LocationIndexer.__getitem__(self, key) 959 if self._is_scalar_access(key): 960 return self.obj._get_value(*key, takeable=self._takeable) --> 961 return self._getitem_tuple(key) 962 else: 963 # we by definition only have the 0th axis 964 axis = self.axis or 0 File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:1461, in _iLocIndexer._getitem_tuple(self, tup) 1459 def _getitem_tuple(self, tup: tuple): -> 1461 tup = self._validate_tuple_indexer(tup) 1462 with suppress(IndexingError): 1463 return self._getitem_lowerdim(tup) File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:769, in _LocationIndexer._validate_tuple_indexer(self, key) 767 for i, k in enumerate(key): 768 try: --> 769 self._validate_key(k, i) 770 except ValueError as err: 771 raise ValueError( 772 "Location based indexing can only have " 773 f"[{self._valid_types}] types" 774 ) from err File ~\Anaconda3\envs\YH\lib\site-packages\pandas\core\indexing.py:1350, in _iLocIndexer._validate_key(self, key, axis) 1348 if hasattr(key, "index") and isinstance(key.index, Index): 1349 if key.index.inferred_type == "integer": -> 1350 raise NotImplementedError( 1351 "iLocation based boolean " 1352 "indexing on an integer type " 1353 "is not available" 1354 ) 1355 raise ValueError( 1356 "iLocation based boolean indexing cannot use " 1357 "an indexable as a mask" 1358 ) 1359 return NotImplementedError: iLocation based boolean indexing on an integer type is not available
In [118]:
df
Out[118]:
Employee ID | Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|---|
0 | 111 | Chanel | 35 | 3 |
1 | 222 | Steve | 29 | 4 |
2 | 333 | Mitch | 38 | 9 |
3 | 444 | Bird | 20 | 1 |
In [ ]:
# 경력이 3년이상인 사람의 이름과 시급정보를 가져오시오
In [119]:
df.loc[ df["Years of Experience"] >= 3 , ["Employee Name", 'Salary [$/h]' ]]
Out[119]:
Employee Name | Salary [$/h] | |
---|---|---|
0 | Chanel | 35 |
1 | Steve | 29 |
2 | Mitch | 38 |
In [ ]:
In [120]:
# 경력이 4년이상 8년 이하인 사람의 데이터를 가져오시오
In [128]:
df["Years of Experience"] >= 4
Out[128]:
0 False 1 True 2 True 3 False Name: Years of Experience, dtype: bool
In [129]:
df["Years of Experience"] <= 8
Out[129]:
0 True 1 True 2 False 3 True Name: Years of Experience, dtype: bool
In [133]:
df.loc[ (df["Years of Experience"] >= 4) & (df["Years of Experience"] <= 8), "Employee Name": ]
Out[133]:
Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|
1 | Steve | 29 | 4 |
In [134]:
# 시급이 가장 높은 사람의 데이터를 가져오시오
In [135]:
df
Out[135]:
Employee ID | Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|---|
0 | 111 | Chanel | 35 | 3 |
1 | 222 | Steve | 29 | 4 |
2 | 333 | Mitch | 38 | 9 |
3 | 444 | Bird | 20 | 1 |
In [140]:
df["Salary [$/h]"].max()
Out[140]:
38
In [141]:
df["Salary [$/h]"] == df["Salary [$/h]"].max()
Out[141]:
0 False 1 False 2 True 3 False Name: Salary [$/h], dtype: bool
In [142]:
df.loc[ df["Salary [$/h]"] == df["Salary [$/h]"].max() , ]
Out[142]:
Employee ID | Employee Name | Salary [$/h] | Years of Experience | |
---|---|---|---|---|
2 | 333 | Mitch | 38 | 9 |
'DataScience > Pandas' 카테고리의 다른 글
Pandas 데이터프레임 오름차순, 내림차순 정렬 .Sort_values() ,sort_index() (0) | 2022.11.25 |
---|---|
Pandas 사용자 정의 함수사용 .apply(), 판다스내장.str라이브러리 (0) | 2022.11.25 |
Pandas NaN을 처리하는 전략 dropna(), fillna() (0) | 2022.11.24 |
Pandas CSV파일불러오기, .describe() 통계, .info()정보 (0) | 2022.11.24 |
Pandas 행, 열 추가, 데이터 삭제 drop(), rename(), 인덱스 초기화 reset_index(inplace= True) (0) | 2022.11.24 |