인문대생의 데이터 직군 취업로그 32

류지윤·2023년 7월 18일
0

[ 함수(def)의 기초 ]

def test_def(a, b):
    """Return the result of ``a + b``."""
    return a + b
c = test_def(2, 3)
c
  • 가장 기초적인 모양의 def 정의
  • 이름(test_def)과 입력 인자(a, b)를 정해준다
  • 출력(return)을 작성
  • 전역변수(global)
    a = 1 

def edit_a(i):
    """Rebind the module-level variable ``a`` to ``i``.

    The ``global`` declaration makes the assignment below target the
    module-level name instead of creating a function-local one.
    """
    global a
    a = i

edit_a(2)
a


- global 변수를 def 내에서 사용하고 싶다면 global로 선언

def edit_a(i):
    """Assign ``i`` to a function-local ``a``.

    Without a ``global`` declaration the assignment creates a local name,
    so a module-level ``a`` is left untouched.
    """
    a = i
edit_a(5)
print(a)


- def 내에서의 변수와 밖에서의 변수는 같은 이름이어도 같은 것이 아니다
- $y = a \sin(2\pi f t + t_0) + b$

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
def plotSinWave(amp, freq, endTime, sampleTime, startTime, bias):
    """
    Plot a sine wave.

    y = a sin(2 pi f t + t_0) + b

    Args:
        amp: amplitude ``a``.
        freq: frequency ``f``.
        endTime: stop value of the time axis (exclusive, as in ``np.arange``).
        sampleTime: step between consecutive time samples.
        startTime: first value of the time axis; also used as the phase
            offset ``t_0``.
        bias: constant offset ``b``.
    """
    time = np.arange(startTime, endTime, sampleTime)
    result = amp * np.sin(2 * np.pi * freq * time + startTime) + bias

    plt.figure(figsize=(12, 6))
    plt.plot(time, result)
    plt.grid(True)
    plt.xlabel("time")
    plt.ylabel("sin")
    plt.title(str(amp) + "*sin(2*pi" + str(freq) + "*t+" + str(startTime) + ")+" + str(bias))
    plt.show()

plotSinWave(2, 1, 10, 0.01, 0.5, 0)
def plotSinWave(**kwargs):
    """
    Plot a sine wave.

    y = a sin(2 pi f t + t_0) + b

    Keyword Args:
        endTime: stop value of the time axis, exclusive (default 1).
        sampleTime: step between time samples (default 0.01).
        amp: amplitude ``a`` (default 1).
        freq: frequency ``f`` (default 1).
        startTime: first time value, also used as the phase ``t_0``
            (default 0).
        bias: constant offset ``b`` (default 0).
        figsize: figure size passed to ``plt.figure`` (default (12, 6)).
    """
    endTime = kwargs.get("endTime", 1)
    sampleTime = kwargs.get("sampleTime", 0.01)
    amp = kwargs.get("amp", 1)
    freq = kwargs.get("freq", 1)
    startTime = kwargs.get("startTime", 0)
    bias = kwargs.get("bias", 0)
    figsize = kwargs.get("figsize", (12, 6))

    time = np.arange(startTime, endTime, sampleTime)
    result = amp * np.sin(2 * np.pi * freq * time + startTime) + bias

    # Honour the caller-supplied figsize instead of a hard-coded size.
    plt.figure(figsize=figsize)
    plt.plot(time, result)
    plt.grid(True)
    plt.xlabel("time")
    plt.ylabel("sin")
    plt.title(str(amp) + "*sin(2*pi" + str(freq) + "*t+" + str(startTime) + ")+" + str(bias))
    plt.show()

plotSinWave()
plotSinWave(amp=2, freq=0.5, endTime=10)

[ 내가 만든 함수 import ]
- drawSinWave.py
%%writefile ./drawSinWave.py

import numpy as np
import matplotlib.pyplot as plt

def plotSinWave(**kwargs):
    """
    Plot a sine wave.

    y = a sin(2 pi f t + t_0) + b

    Keyword Args:
        endTime: stop value of the time axis, exclusive (default 1).
        sampleTime: step between time samples (default 0.01).
        amp: amplitude ``a`` (default 1).
        freq: frequency ``f`` (default 1).
        startTime: first time value, also used as the phase ``t_0``
            (default 0).
        bias: constant offset ``b`` (default 0).
        figsize: figure size passed to ``plt.figure`` (default (12, 6)).
    """
    endTime = kwargs.get("endTime", 1)
    sampleTime = kwargs.get("sampleTime", 0.01)
    amp = kwargs.get("amp", 1)
    freq = kwargs.get("freq", 1)
    startTime = kwargs.get("startTime", 0)
    bias = kwargs.get("bias", 0)
    figsize = kwargs.get("figsize", (12, 6))

    time = np.arange(startTime, endTime, sampleTime)
    result = amp * np.sin(2 * np.pi * freq * time + startTime) + bias

    # Honour the caller-supplied figsize instead of a hard-coded size.
    plt.figure(figsize=figsize)
    plt.plot(time, result)
    plt.grid(True)
    plt.xlabel("time")
    plt.ylabel("sin")
    plt.title(str(amp) + "*sin(2*pi" + str(freq) + "*t+" + str(startTime) + ")+" + str(bias))
    plt.show()

# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    print("hello world~!!")
    print("this is test graph!!")
    plotSinWave(amp=1, endTime=2)

  • Overwriting ./drawSinWave.py

import drawSinWave as dS
dS.plotSinWave()
dS.plotSinWave(freq=5)

[ 그래프 한글 설정 ]
%%writefile ./set_matplotlib_hangul.py

import platform
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# Font file looked up on Windows.
path = "c:/Windows/Fonts/malgun.ttf"

# Pick a matplotlib font family according to the operating system.
if platform.system() == "Darwin":
    print("Hangul OK in your MAC!!!")
    rc("font", family="Arial Unicode MS")
elif platform.system() == "Windows":
    # Resolve the family name from the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
    print("Hangul OK in your Windows!!!")
    rc("font", family=font_name)
else:
    print("Unknown system.. sorry~~~")

# Draw minus signs with an ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

  • Overwriting ./set_matplotlib_hangul.py
    import set_matplotlib_hangul
    plt.title("한글")

[ 2. Fbprophet 기초 ]

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Two years of daily samples: 12 sine periods over the normalised time axis.
# The sample count must match the number of dates generated below.
time = np.linspace(0, 1, 365 * 2)
result = np.sin(2 * np.pi * 12 * time)
ds = pd.date_range("2018-01-01", periods=365 * 2, freq="D")
# Build the frame with the "ds" (date) and "y" (value) columns fitted later.
df = pd.DataFrame({"ds": ds, "y": result})
df.head()

df["y"].plot(figsize=(10, 6));

from fbprophet import Prophet

m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df);

future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)

m.plot(forecast);

2
# Same 12-period sine as before, plus a linear upward trend (+ time).
# The sample count must match the number of dates generated below.
time = np.linspace(0, 1, 365 * 2)
result = np.sin(2 * np.pi * 12 * time) + time

ds = pd.date_range("2018-01-01", periods=365 * 2, freq="D")
df = pd.DataFrame({"ds": ds, "y": result})

df["y"].plot(figsize=(10, 6));

m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df)
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
m.plot(forecast);

3
# 12-period sine plus a linear trend plus Gaussian noise scaled by 1/4.
# The sample count must match both the noise vector and the date range.
time = np.linspace(0, 1, 365 * 2)
result = np.sin(2 * np.pi * 12 * time) + time + np.random.randn(365 * 2) / 4

ds = pd.date_range("2018-01-01", periods=365 * 2, freq="D")
df = pd.DataFrame({"ds": ds, "y": result})

df["y"].plot(figsize=(10, 6));
m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df)
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
m.plot(forecast);

import pandas as pd
import pandas_datareader as web
import numpy as np
import matplotlib.pyplot as plt

from fbprophet import Prophet
from datetime import datetime

%matplotlib inline

pinkwink_web = pd.read_csv(
"../data/05_PinkWink_Web_Traffic.csv",
encoding="utf-8",
thousands=",",
names=["date", "hit"],
index_col=0
)

pinkwink_web = pinkwink_web[pinkwink_web["hit"].notnull()]
pinkwink_web.head()



* 전체 데이터 그려보기 

pinkwink_web["hit"].plot(figsize=(12, 4), grid=True);

trend 분석을 시각화하기 위한 x축 값을 만들기

time = np.arange(0, len(pinkwink_web))
traffic = pinkwink_web["hit"].values
fx = np.linspace(0, time[-1], 1000)

에러를 계산할 함수

def error(f, x, y):
    """Return the root-mean-square error between ``f(x)`` and ``y``.

    Args:
        f: callable applied to ``x`` to produce predictions.
        x: input values passed to ``f``.
        y: observed values compared against ``f(x)``.
    """
    return np.sqrt(np.mean((f(x) - y) ** 2))

# Fit polynomials of degree 1, 2, 3 and 15 to the traffic series and wrap
# each coefficient array in a callable np.poly1d.
fp1 = np.polyfit(time, traffic, 1)
f1 = np.poly1d(fp1)

f2p = np.polyfit(time, traffic, 2)
f2 = np.poly1d(f2p)

f3p = np.polyfit(time, traffic, 3)
f3 = np.poly1d(f3p)

f15p = np.polyfit(time, traffic, 15)
f15 = np.poly1d(f15p)

# Print the RMS error of each fit, evaluated on the data it was fitted to.
print(error(f1, time, traffic))
print(error(f2, time, traffic))
print(error(f3, time, traffic))
print(error(f15, time, traffic))

# Scatter the raw data and overlay the four fitted curves, evaluated on fx.
plt.figure(figsize=(12, 4))
plt.scatter(time, traffic, s=10)
plt.plot(fx, f1(fx), lw=4, label='f1')
plt.plot(fx, f2(fx), lw=4, label='f2')
plt.plot(fx, f3(fx), lw=4, label='f3')
plt.plot(fx, f15(fx), lw=4, label='f15')

plt.grid(True, linestyle="-", color="0.75")
plt.legend(loc=2)
plt.show()

df = pd.DataFrame({"ds": pinkwink_web.index, "y": pinkwink_web["hit"]})
df.reset_index(inplace=True)
df["ds"] = pd.to_datetime(df["ds"], format="%y. %m. %d.")
del df["date"]
df.head()

m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df);

60일에 해당하는 데이터 예측

future = m.make_future_dataframe(periods=60)
future.tail()

예측 결과는 상한/하한의 범위를 포함해서 얻어진다

forecast = m.predict(future)
forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail()

m.plot(forecast);

m.plot_components(forecast);

[ 4. 주식 데이터 fbprophet 으로 분석하기 ]
  1. yahoo finance

# Request/urlopen and BeautifulSoup are used below but were not imported
# anywhere before this cell.
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/035420.KS/history?p=035420.KS&guccounter=1"
# Send an explicit User-Agent header with the request.
req = Request(url, headers={"User-Agent": "Chrome"})
page = urlopen(req).read()
soup = BeautifulSoup(page, "html.parser")
# Take the first <table> on the page and let pandas parse it.
table = soup.find("table")
df_raw = pd.read_html(str(table))[0]
df_raw.head()

fbprophet을 사용하는 형식에 맞춰준 뒤, 맨 마지막 NaN 값이 있어서 제외

df_tmp = pd.DataFrame({"ds": df_raw["Date"], "y": df_raw["Close*"]})
df_target = df_tmp[:-1]
df_target.head()

hardcopy 후, 날짜를 fbprophet이 요구하는 형태로 변형

df = df_target.copy()
df["ds"] = pd.to_datetime(df_target["ds"], format="%b %d, %Y")
df.head()

df.info()

데이터형 변환 object => float

df["y"] = df["y"].astype("float")
df.info()

m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df);

future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail()

plt.figure(figsize=(12, 6))
plt.plot(df["ds"], df["y"], label="real")
plt.grid(True)
plt.show()

m.plot_components(forecast);

  2. KIA
    !pip install yfinance

기아 자동차의 종목코드를 가지고 기간을 입력한다

import yfinance as yf
from pandas_datareader import data

yf.pdr_override()

start_date = "2010-03-01"
end_date = "2018-02-28"
KIA = data.get_data_yahoo("000270.KS", start_date, end_date)

KIA.head()

KIA["Close"].plot(figsize=(12, 6), grid=True);

accuracy 확인을 위한 데이터

KIA_trunc = KIA[:"2017-11-30"]
KIA_trunc.head()

forecast를 위한 준비

df = pd.DataFrame({"ds": KIA_trunc.index, "y":KIA_trunc["Close"]})
df.reset_index(inplace=True)
del df["Date"]
df.head()

m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df);

future = m.make_future_dataframe(periods=90)
forecast = m.predict(future)
forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail()

m.plot(forecast);
m.plot_components(forecast);
KIA["Close"]
plt.figure(figsize=(12, 6))
plt.plot(KIA.index, KIA["Close"], label="real")
plt.plot(forecast["ds"], forecast["yhat"], label="forecast")
plt.grid(True)
plt.legend()
plt.show()

  3. 대한항공
    003490 대한항공

start_date = "2010-03-01"
end_date = "2018-02-28"
KoreaAir = data.get_data_yahoo("003490.KS", start_date, end_date)
KoreaAir.tail()
KoreaAir["Close"].plot(figsize=(12, 6), grid=True);

accuracy 데이터 분리

KoreaAir_trunc = KoreaAir[:"2017-11-30"]
KoreaAir_trunc.tail()

forecast를 위한 준비

df = pd.DataFrame({"ds": KoreaAir_trunc.index, "y": KoreaAir_trunc["Close"]})
df.reset_index(inplace=True)
del df["Date"]
df.head()
m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df)
future = m.make_future_dataframe(periods=90)
forecast = m.predict(future)
forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail()
m.plot(forecast);
m.plot_components(forecast);

실제와 예측값 비교 그래프

plt.figure(figsize=(12, 6))
plt.plot(KoreaAir.index, KoreaAir["Close"], label="real")
plt.plot(forecast["ds"], forecast["yhat"], label="forecast")
plt.grid(True)
plt.legend()
plt.show()

  4. 조금 특이한 형태의 데이터에 대한 forecast

Logistic 성장형 그래프를 가진 데이터에 대한 forecast

df = pd.read_csv("../data/05_example_wp_R2.csv", index_col=0)
df["y"].plot(figsize=(12, 4), grid=True);

df["cap"] = 8.5
df.tail()

m = Prophet(growth="logistic", daily_seasonality=True)
m.fit(df);

future = m.make_future_dataframe(periods=1826)
future["cap"] = 8.5
forecast = m.predict(future)
m.plot(forecast);

5. 비트코인 데이터 fbprophet으로 분석하기
- https://bitcoincharts.com/charts/bitstampUSD#rg60ztgSzm1g10zm2g25zv
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import time 

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from fbprophet import Prophet

%matplotlib inline 

url = "https://bitcoincharts.com/charts/bitstampUSD#rg730ztgSzm1g10zm2g25zv"
driver = webdriver.Chrome("../driver/chromedriver")
driver.get(url)

#스크롤 
xpath = '//*[@id="content_chart"]/div/div[2]/a'
variable = driver.find_element_by_xpath(xpath)
driver.execute_script("return arguments[0].scrollIntoView();", variable)
variable.click()


html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", "data")
table

driver.quit()

df = pd.read_html(str(table))
bitcoin = df[0]
bitcoin.head()

!rm -rf "../data/05_bitcoin_history.csv"
bitcoin.to_csv("../data/05_bitcoin_history.csv", sep=",")

bitcoin = pd.read_csv("../data/05_bitcoin_history.csv", index_col=0)
bitcoin.tail()

#분석하고 싶은 항목(Close)만 가지고, Prophet 적용 

df = pd.DataFrame({"ds": bitcoin["Timestamp"], "y": bitcoin["Close"]})
m = Prophet(yearly_seasonality=True, daily_seasonality=True)
m.fit(df);

#향후 30일간의 forecast
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
m.plot(forecast);

#트렌드 
m.plot_components(forecast);
  • 미니 프로젝트 시작 중 웹크롤링 다음 가장 어려웠다.
  • 눈으로 1번, 무작정 따라하기 1번, 개인연습 1번씩 하기로 다짐 :)

1개의 댓글

comment-user-thumbnail
2023년 7월 18일

정말 유익한 글이었습니다.

답글 달기