Working with Time Series¶
Dates and Times in Python¶
Native Python dates and times: datetime and dateutil¶
In [1]:
from datetime import datetime
In [2]:
# Fixed reference timestamp: 11 May 2022 at 15:30 (seconds left at their default of 0)
someday = datetime(year=2022, month=5, day=11, hour=15, minute=30)
In [3]:
someday.isoformat()
Out[3]:
'2022-05-11T15:30:00'
In [4]:
someday.weekday()
Out[4]:
2
In [7]:
someday.strftime('%Y년 %m월 %d일')
Out[7]:
'2022년 05월 11일'
In [8]:
date_str = '2022-05-21'
In [9]:
from dateutil.parser import parse
In [10]:
parse(date_str)
Out[10]:
datetime.datetime(2022, 5, 21, 0, 0)
In [11]:
date_list = ['2022-05-21', '2022-06-23', '2022-07-21']
In [12]:
date_list
Out[12]:
['2022-05-21', '2022-06-23', '2022-07-21']
In [13]:
# Intentional failure: dateutil's parse() accepts a single string (or character
# stream), so handing it a whole list raises the TypeError shown in the output.
parse(date_list)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In [13], line 1 ----> 1 parse(date_list) File ~\Anaconda3\envs\YH\lib\site-packages\dateutil\parser\_parser.py:1368, in parse(timestr, parserinfo, **kwargs) 1366 return parser(parserinfo).parse(timestr, **kwargs) 1367 else: -> 1368 return DEFAULTPARSER.parse(timestr, **kwargs) File ~\Anaconda3\envs\YH\lib\site-packages\dateutil\parser\_parser.py:640, in parser.parse(self, timestr, default, ignoretz, tzinfos, **kwargs) 636 if default is None: 637 default = datetime.datetime.now().replace(hour=0, minute=0, 638 second=0, microsecond=0) --> 640 res, skipped_tokens = self._parse(timestr, **kwargs) 642 if res is None: 643 raise ParserError("Unknown string format: %s", timestr) File ~\Anaconda3\envs\YH\lib\site-packages\dateutil\parser\_parser.py:719, in parser._parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens) 716 yearfirst = info.yearfirst 718 res = self._result() --> 719 l = _timelex.split(timestr) # Splits the timestr into tokens 721 skipped_idxs = [] 723 # year/month/day list File ~\Anaconda3\envs\YH\lib\site-packages\dateutil\parser\_parser.py:201, in _timelex.split(cls, s) 199 @classmethod 200 def split(cls, s): --> 201 return list(cls(s)) File ~\Anaconda3\envs\YH\lib\site-packages\dateutil\parser\_parser.py:69, in _timelex.__init__(self, instream) 67 instream = StringIO(instream) 68 elif getattr(instream, 'read', None) is None: ---> 69 raise TypeError('Parser must be a string or character stream, not ' 70 '{itype}'.format(itype=instream.__class__.__name__)) 72 self.instream = instream 73 self.charstack = [] TypeError: Parser must be a string or character stream, not list
In [ ]:
# parse() 는 리스트를 한 번에 처리할 수 없기 때문에, 아래에서는 numpy 를 이용한다.
Typed arrays of times: NumPy's datetime64¶
파이썬 기본 datetime 의 한계를 보완하기 위해, numpy 는 날짜를 64-bit 정수로 인코딩하는 datetime64 타입을 제공한다. 덕분에 날짜의 배열(array)도 한 번에 빠르게 처리할 수 있다.
In [15]:
import numpy as np
In [18]:
# Wrap one ISO date string in a 0-d NumPy array; with the generic datetime64
# dtype NumPy infers the unit from the string, giving datetime64[D] (see output)
someday = np.array('2022-05-21', dtype='datetime64')
In [19]:
someday
Out[19]:
array('2022-05-21', dtype='datetime64[D]')
In [20]:
someday+10
Out[20]:
numpy.datetime64('2022-05-31')
In [21]:
someday-35
Out[21]:
numpy.datetime64('2022-04-16')
In [22]:
someday + np.arange(5)
Out[22]:
array(['2022-05-21', '2022-05-22', '2022-05-23', '2022-05-24', '2022-05-25'], dtype='datetime64[D]')
In [ ]:
Code | Meaning | Time span (relative) | Time span (absolute) |
---|---|---|---|
Y | Year | ± 9.2e18 years | [9.2e18 BC, 9.2e18 AD] |
M | Month | ± 7.6e17 years | [7.6e17 BC, 7.6e17 AD] |
W | Week | ± 1.7e17 years | [1.7e17 BC, 1.7e17 AD] |
D | Day | ± 2.5e16 years | [2.5e16 BC, 2.5e16 AD] |
h | Hour | ± 1.0e15 years | [1.0e15 BC, 1.0e15 AD] |
m | Minute | ± 1.7e13 years | [1.7e13 BC, 1.7e13 AD] |
s | Second | ± 2.9e11 years | [2.9e11 BC, 2.9e11 AD] |
ms | Millisecond | ± 2.9e8 years | [2.9e8 BC, 2.9e8 AD] |
us | Microsecond | ± 2.9e5 years | [290301 BC, 294241 AD] |
ns | Nanosecond | ± 292 years | [1678 AD, 2262 AD] |
ps | Picosecond | ± 106 days | [1969 AD, 1970 AD] |
fs | Femtosecond | ± 2.6 hours | [1969 AD, 1970 AD] |
as | Attosecond | ± 9.2 seconds | [1969 AD, 1970 AD] |
Dates and times in pandas: best of both worlds¶
In [23]:
import pandas as pd
In [55]:
df = pd.read_csv("../data/GOOG.csv")
In [56]:
df = df.rename(columns={"Date":"날짜"})
In [57]:
df
Out[57]:
날짜 | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
0 | 2004-08-19 | 49.676899 | 51.693783 | 47.669952 | 49.845802 | 49.845802 | 44994500 |
1 | 2004-08-20 | 50.178635 | 54.187561 | 49.925285 | 53.805050 | 53.805050 | 23005800 |
2 | 2004-08-23 | 55.017166 | 56.373344 | 54.172661 | 54.346527 | 54.346527 | 18393200 |
3 | 2004-08-24 | 55.260582 | 55.439419 | 51.450363 | 52.096165 | 52.096165 | 15361800 |
4 | 2004-08-25 | 52.140873 | 53.651051 | 51.604362 | 52.657513 | 52.657513 | 9257400 |
... | ... | ... | ... | ... | ... | ... | ... |
3308 | 2017-10-09 | 980.000000 | 985.424988 | 976.109985 | 977.000000 | 977.000000 | 891400 |
3309 | 2017-10-10 | 980.000000 | 981.570007 | 966.080017 | 972.599976 | 972.599976 | 968400 |
3310 | 2017-10-11 | 973.719971 | 990.710022 | 972.250000 | 989.250000 | 989.250000 | 1693300 |
3311 | 2017-10-12 | 987.450012 | 994.119995 | 985.000000 | 987.830017 | 987.830017 | 1262400 |
3312 | 2017-10-13 | 992.000000 | 997.210022 | 989.000000 | 989.679993 | 989.679993 | 1157700 |
3313 rows × 7 columns
In [58]:
# Drop every price/volume column so that only the date column remains
df = df.drop(columns=["Open", "High", "Low", "Close", "Adj Close", "Volume"])
In [60]:
df
Out[60]:
날짜 | |
---|---|
0 | 2004-08-19 |
1 | 2004-08-20 |
2 | 2004-08-23 |
3 | 2004-08-24 |
4 | 2004-08-25 |
... | ... |
3308 | 2017-10-09 |
3309 | 2017-10-10 |
3310 | 2017-10-11 |
3311 | 2017-10-12 |
3312 | 2017-10-13 |
3313 rows × 1 columns
In [61]:
# NOTE(review): append() is not idempotent — the repeated '2022-01-04' entries in
# Out[63] suggest this cell was executed more than once. Restart the kernel and
# run all cells to get a list without duplicates.
date_list.append("2022-01-04")
In [62]:
# NOTE(review): re-running this cell adds '2022-01-07' again (it appears twice in Out[63]).
date_list.append("2022-01-07")
In [63]:
date_list
Out[63]:
['2022-01-04', '2022-01-07', '2022-05-21', '2022-06-23', '2022-07-21', '2022-01-04', '2022-01-04', '2022-01-07']
In [64]:
date_list.sort()
In [65]:
date_list
Out[65]:
['2022-01-04', '2022-01-04', '2022-01-04', '2022-01-07', '2022-01-07', '2022-05-21', '2022-06-23', '2022-07-21']
In [32]:
# pd.to_datetime converts a whole list of date strings in a single call;
# the result shown in the output is a DatetimeIndex (dtype datetime64[ns]).
# NOTE(review): this cell's execution count (In [32]) predates the appends above,
# so its 5-entry output does not match the 8-entry date_list in Out[65] — re-run to refresh.
pd.to_datetime(date_list)
Out[32]:
DatetimeIndex(['2022-01-04', '2022-01-07', '2022-05-21', '2022-06-23', '2022-07-21'], dtype='datetime64[ns]', freq=None)
In [66]:
df["날짜"]
Out[66]:
0 2004-08-19 1 2004-08-20 2 2004-08-23 3 2004-08-24 4 2004-08-25 ... 3308 2017-10-09 3309 2017-10-10 3310 2017-10-11 3311 2017-10-12 3312 2017-10-13 Name: 날짜, Length: 3313, dtype: object
In [67]:
# Parse the date strings and store each row's weekday number
# (0 = Monday … 4 = Friday in the output below)
df["요일"] = pd.to_datetime(df["날짜"]).dt.dayofweek
In [68]:
df
Out[68]:
날짜 | 요일 | |
---|---|---|
0 | 2004-08-19 | 3 |
1 | 2004-08-20 | 4 |
2 | 2004-08-23 | 0 |
3 | 2004-08-24 | 1 |
4 | 2004-08-25 | 2 |
... | ... | ... |
3308 | 2017-10-09 | 0 |
3309 | 2017-10-10 | 1 |
3310 | 2017-10-11 | 2 |
3311 | 2017-10-12 | 3 |
3312 | 2017-10-13 | 4 |
3313 rows × 2 columns
In [69]:
# Add the abbreviated weekday name (strftime "%a", e.g. "Thu" in the output)
# alongside the numeric weekday column
df["day"] = pd.to_datetime(df["날짜"]).dt.strftime("%a")
In [70]:
df
Out[70]:
날짜 | 요일 | day | |
---|---|---|---|
0 | 2004-08-19 | 3 | Thu |
1 | 2004-08-20 | 4 | Fri |
2 | 2004-08-23 | 0 | Mon |
3 | 2004-08-24 | 1 | Tue |
4 | 2004-08-25 | 2 | Wed |
... | ... | ... | ... |
3308 | 2017-10-09 | 0 | Mon |
3309 | 2017-10-10 | 1 | Tue |
3310 | 2017-10-11 | 2 | Wed |
3311 | 2017-10-12 | 3 | Thu |
3312 | 2017-10-13 | 4 | Fri |
3313 rows × 3 columns
In [ ]:
Pandas Time Series: Indexing by Time¶
In [35]:
pd.DatetimeIndex(date_list)
Out[35]:
DatetimeIndex(['2022-01-04', '2022-01-07', '2022-05-21', '2022-06-23', '2022-07-21'], dtype='datetime64[ns]', freq=None)
In [36]:
pd.to_datetime(date_list)
Out[36]:
DatetimeIndex(['2022-01-04', '2022-01-07', '2022-05-21', '2022-06-23', '2022-07-21'], dtype='datetime64[ns]', freq=None)
In [ ]:
Regular sequences: pd.date_range()¶
In [ ]:
# 시작 날짜와 종료 날짜를 지정하면,
# 그 사이의 날짜를 자동으로 채워 주는 함수 (기본 간격은 하루, freq='D')
In [37]:
pd.date_range('2022-11-30',"2023-02-05")
Out[37]:
DatetimeIndex(['2022-11-30', '2022-12-01', '2022-12-02', '2022-12-03', '2022-12-04', '2022-12-05', '2022-12-06', '2022-12-07', '2022-12-08', '2022-12-09', '2022-12-10', '2022-12-11', '2022-12-12', '2022-12-13', '2022-12-14', '2022-12-15', '2022-12-16', '2022-12-17', '2022-12-18', '2022-12-19', '2022-12-20', '2022-12-21', '2022-12-22', '2022-12-23', '2022-12-24', '2022-12-25', '2022-12-26', '2022-12-27', '2022-12-28', '2022-12-29', '2022-12-30', '2022-12-31', '2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08', '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12', '2023-01-13', '2023-01-14', '2023-01-15', '2023-01-16', '2023-01-17', '2023-01-18', '2023-01-19', '2023-01-20', '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24', '2023-01-25', '2023-01-26', '2023-01-27', '2023-01-28', '2023-01-29', '2023-01-30', '2023-01-31', '2023-02-01', '2023-02-02', '2023-02-03', '2023-02-04', '2023-02-05'], dtype='datetime64[ns]', freq='D')
In [39]:
# freq='H' produces one timestamp per hour between the two endpoints
# (1611 values in the output below).
# NOTE(review): pandas 2.2+ deprecates the 'H' alias in favour of 'h' — confirm
# against the installed pandas version before re-running.
pd.date_range("2022-11-30 09:00","2023-02-05 11:00", freq= 'H')
Out[39]:
DatetimeIndex(['2022-11-30 09:00:00', '2022-11-30 10:00:00', '2022-11-30 11:00:00', '2022-11-30 12:00:00', '2022-11-30 13:00:00', '2022-11-30 14:00:00', '2022-11-30 15:00:00', '2022-11-30 16:00:00', '2022-11-30 17:00:00', '2022-11-30 18:00:00', ... '2023-02-05 02:00:00', '2023-02-05 03:00:00', '2023-02-05 04:00:00', '2023-02-05 05:00:00', '2023-02-05 06:00:00', '2023-02-05 07:00:00', '2023-02-05 08:00:00', '2023-02-05 09:00:00', '2023-02-05 10:00:00', '2023-02-05 11:00:00'], dtype='datetime64[ns]', length=1611, freq='H')
In [41]:
# freq='B' keeps business days only — the output jumps from 2022-12-02
# straight to 2022-12-05, skipping the weekend
pd.date_range("2022-11-30","2023-02-05", freq= 'B')
Out[41]:
DatetimeIndex(['2022-11-30', '2022-12-01', '2022-12-02', '2022-12-05', '2022-12-06', '2022-12-07', '2022-12-08', '2022-12-09', '2022-12-12', '2022-12-13', '2022-12-14', '2022-12-15', '2022-12-16', '2022-12-19', '2022-12-20', '2022-12-21', '2022-12-22', '2022-12-23', '2022-12-26', '2022-12-27', '2022-12-28', '2022-12-29', '2022-12-30', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12', '2023-01-13', '2023-01-16', '2023-01-17', '2023-01-18', '2023-01-19', '2023-01-20', '2023-01-23', '2023-01-24', '2023-01-25', '2023-01-26', '2023-01-27', '2023-01-30', '2023-01-31', '2023-02-01', '2023-02-02', '2023-02-03'], dtype='datetime64[ns]', freq='B')
In [ ]:
Frequencies and Offsets¶
Code | Description | Code | Description |
---|---|---|---|
D | Calendar day | B | Business day |
W | Weekly | | |
M | Month end | BM | Business month end |
Q | Quarter end | BQ | Business quarter end |
A | Year end | BA | Business year end |
H | Hours | BH | Business hours |
T | Minutes | | |
S | Seconds | | |
L | Milliseconds | | |
U | Microseconds | | |
N | Nanoseconds | | |
'DataScience > Pandas' 카테고리의 다른 글
Pandas 시카고 범죄율을 예측 Prophet, error_bad_lines, to_datetime(format), resample 함수의 사용법과, 이 함수를 사용하기 위해 인덱스를 설정하는 방법 (0) | 2023.01.03 |
---|---|
Pandas Prophet 라이브러리를 이용한 Time Series 데이터 예측 방법 (0) | 2023.01.02 |
Pandas Tip[1] 문자열 컬럼의 슬라이싱. str (0) | 2022.11.30 |
Pandas concat(), merge() 여러 데이터 프레임을 하나로 합치는 방법 (0) | 2022.11.25 |
Pandas 데이터프레임 오름차순, 내림차순 정렬 .Sort_values() ,sort_index() (0) | 2022.11.25 |