# 2022/01/08 (Sat)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Toy example: twelve hand-written temperatures drive two simulated series.
# Each series is a linear function of temperature plus Gaussian noise, and the
# global NumPy seed is reset before each draw so the noise is reproducible.
np.random.seed(1)
temp = np.array([
    -10.2, -5.2, 0.1, 10.1, 12.2, 14.7,
    25.4, 26.8, 28.9, 35.1, 32.2, 34.6,
])
ϵ1 = np.random.normal(scale=5, size=12)
icecream = 20 + 2 * temp + ϵ1

np.random.seed(2)
ϵ2 = np.random.normal(scale=5, size=12)
disease = 30 + 0.5 * temp + ϵ2

# Each simulated series against temperature.
plt.plot(temp, icecream, '.')
plt.plot(temp, disease, '.')

# Ice cream vs. polio.
plt.plot(icecream, disease, '.')

# Only the first six observations: when the points are observed at similar
# temperatures, the linear relationship becomes weaker.
plt.plot(icecream[0:6], disease[0:6], '.')
# Load the dataset from the course repository (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/guebin/2021DV/master/_notebooks/extremum.csv')
df.columns
pd.Series(df.columns)

# Column index 3 is the average-temperature column (per the original author's
# note; the CSV schema is not visible here).
temp = np.array(df.iloc[:, 3])
len(temp)  # the original author observed 656 rows for this column

# Simulate ice-cream sales as a linear function of temperature plus noise.
# The noise length is taken from the data instead of a hard-coded 656, so the
# script keeps working if the CSV gains or loses rows.
np.random.seed(1)
ϵ1 = np.random.normal(size=len(temp), scale=10)
icecream = temp * 2 + 30 + ϵ1

# Simulate disease counts the same way; they depend on temperature only.
np.random.seed(2)
ϵ2 = np.random.normal(size=len(temp), scale=1)
disease = temp * 0.5 + 40 + ϵ2

plt.plot(temp, icecream, '.')
plt.plot(temp, disease, '.')
plt.plot(icecream, disease, '.')

# Correlation coefficient between the two series; it does not imply causation.
np.corrcoef(icecream, disease)

# Restricting to a similar temperature range (temp > 25) weakens the linear
# relationship between ice cream and disease.
plt.plot(icecream[temp > 25], disease[temp > 25], '.')
# 2x2 grid of panels: top row shows each series against temperature, bottom
# row shows disease against ice cream (the bottom-right panel additionally
# overlays the temp > 25 subset on top of the full scatter).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2)
ax1.plot(temp, icecream, '.')
ax2.plot(temp, disease, '.')
ax3.plot(icecream, disease, '.')
ax4.plot(icecream, disease, '.')
ax4.plot(icecream[temp > 25], disease[temp > 25], '.')

# Collect the three series into one frame for the grouped plots below.
df1 = pd.DataFrame(dict(temp=temp, icecream=icecream, disease=disease))
df1
df1['temp']
df1['temp'].hist()  # histogram of temperature; the original note equates this with plt.hist(df1.temp)
def f(x: float) -> str:
    """Return the temperature-band label for *x*.

    The bands are half-open and checked in ascending order:

    * ``x < 0``        -> ``'group0'``
    * ``0 <= x < 10``  -> ``'group10'``
    * ``10 <= x < 20`` -> ``'group20'``
    * anything else    -> ``'group30'``

    "Anything else" covers ``x >= 20`` and also values for which every
    comparison is false (for example NaN).
    """
    if x < 0:
        return 'group0'
    if x < 10:
        return 'group10'
    if x < 20:
        return 'group20'
    return 'group30'
# Label every row with its temperature band. The labels are built as a plain
# list before being assigned as the new 'temp2' column.
df1['temp2'] = [f(t) for t in df1.temp]
df1
from plotnine import *
# Disease vs. ice cream, with points colored by temperature band.
(
    ggplot(df1)
    + geom_point(aes(x='icecream', y='disease', color='temp2'))
)

# Same scatter plus a dashed smoothing line for each temperature band.
(
    ggplot(df1)
    + geom_point(aes(x='icecream', y='disease', color='temp2'))
    + geom_smooth(aes(x='icecream', y='disease', colour='temp2'), size=1, linetype='dashed')
)

# Equivalent plot with the x/y mapping declared once on the ggplot call, so
# each layer only has to add the color mapping.
(
    ggplot(df1, aes(x='icecream', y='disease'))
    + geom_point(aes(color='temp2'))
    + geom_smooth(aes(color='temp2'), size=1, linetype='dashed')
)
# Now suppose ice cream consumption and the disease really are related.
# Alternative scenario: disease now depends directly on ice cream (and not on
# temperature). The noise length follows the data instead of a hard-coded 656.
np.random.seed(1)
ϵ1 = np.random.normal(size=len(temp), scale=10)
icecream = temp * 2 + 30 + ϵ1

np.random.seed(2)
ϵ2 = np.random.normal(size=len(temp), scale=1)
# The generating formula itself has to change for this scenario: the
# temperature coefficient is 0 and ice cream enters with coefficient 0.15.
disease = 30 + temp * 0.0 + icecream * 0.15 + ϵ2 * 2

df2 = pd.DataFrame({'temp': temp, 'icecream': icecream, 'disease': disease})
# When adding a column to an existing frame via map(), wrap the result in a
# list first.
df2['temp2'] = list(map(f, df2.temp))

ggplot(df2)+geom_point(aes(x='icecream',y='disease',colour='temp2'))\
+geom_smooth(aes(x='icecream',y='disease',colour='temp2'),size=2,linetype='dashed')

# Correlation matrices of the numeric columns only. The string column 'temp2'
# must be excluded explicitly: pandas >= 2.0 no longer drops non-numeric
# columns in DataFrame.corr() and raises instead.
df1.select_dtypes(include='number').corr()
df2.select_dtypes(include='number').corr()