2022/01/12/WED
import pandas as pd
import numpy as np
from plotnine import *
import matplotlib.pyplot as plt
url = 'https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/state_fruit.csv'
df=pd.read_csv(url)
df
df=pd.read_csv(url,index_col=0)
df
df.stack() # 인덱스 기준으로 각 열 값 정리
df.stack().reset_index().rename(columns={'level_0':'group1','level_1':'group2',0:'X'})
df2=pd.read_csv(url)
df2.rename(columns={'Unnamed: 0':'group1'})
df2.rename(columns={'Unnamed: 0':'group1'}).melt(id_vars='group1')
df2.rename(columns={'Unnamed: 0':'group1'}).melt(id_vars='group1')\
.rename(columns={'variable':'group2','value':'X'})
df=pd.read_csv(url,index_col=0)
df
df.reset_index()
df.reset_index().melt(id_vars='index')\
.rename(columns={'index':'group1','variable':'group2','value':'X'})
df2=pd.read_csv(url)
df2
df2.set_index('Unnamed: 0')
df2.set_index('Unnamed: 0').stack()
df2.set_index('Unnamed: 0').stack().reset_index()\
.rename(columns={'Unnamed: 0':'group1','level_1':'group2',0:'X'})
g=['A']*100+['B']*200 # df에 넣어줄 땐 list
y=list(np.random.randn(100)*2+2)+list(np.random.randn(200)+3) # df에 넣어줄 땐 list
df=pd.DataFrame({'g':g,'y':y})
df
ggplot(df)+geom_bar(aes(x='g',fill='g')) # 디폴트로 Y축은 A와 B의 개수 카운트로 수행 -> 100,200개
df.groupby(by='g').count()
df.groupby(by='g').count().reset_index()
fig=ggplot(df.groupby(by='g').count().reset_index())
fig+geom_bar(aes(x='g',y='y',fill='g'),stat='identity')
# stat='identity' (값을 그대로 사용하라는 파라미터)
ggplot(df)+geom_bar(aes(x='g',fill='g'),stat='count')
# stat='count' => 굳이 작성을 안 해줘도 동일한 결과를 얻을 수 있음, 디폴트로 count가 실행되기 때문
td=df.groupby(by='g').count().reset_index()
td
plt.bar(td.g,td.y)
td.plot(kind='bar',x='g',y='y')
df.groupby('g').agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})
df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack() # index기준으로 자료들을 stack한다고 생각하면 됨
df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack().reset_index()
td=df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack().reset_index()
td
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity')
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')
ggplot(td)\
+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()
ggplot(td)\
+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()+facet_wrap('level_1')
td
ggplot(td)\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()
ggplot(td)\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()+facet_wrap('level_1')
td
ggplot(td)+facet_grid('level_1~g')\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity')+coord_flip()
DEP=(['A1']*2+['A2']*2+['B1']*2+['B2']*2)*2
GEN=['M']*8+['F']*8
STATE=['PASS','FAIL']*8
COUNT=[1,9,2,8,80,20,85,15,5,5,5,5,9,1,9,1]
df=pd.DataFrame({'DEP':DEP,'STATE':STATE,'GEN':GEN,'COUNT':COUNT})
df
df.groupby(['GEN','STATE']).agg({'COUNT':np.sum})
df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).index
- index가 tuple로 MultiIndex로 들어가있음
df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
df.groupby(['GEN']).agg({'COUNT':np.sum}).reset_index().rename(columns={'COUNT':'SUM'})
_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
_df2=df.groupby(['GEN']).agg({'COUNT':np.sum}).reset_index().rename(columns={'COUNT':'SUM'})
방법 1
def f(x):
if x=='F':
return 40
if x=='M':
return 220
_df1['SUM']=list(map(f,_df1.GEN))
_df1
- 방법 2
_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
_df1
_df2
def f(x):
return lambda x: _df2.query('GEN == @x').SUM.item()
_df1['SUM']=list(map(f(_df1.GEN),_df1.GEN))
_df1
- 방법 3
_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
pd.merge(_df1,_df2)
_df1.merge(_df2)
_df2.merge(_df1)
td=_df2.merge(_df1)
td
td['PROP']=td.COUNT/td.SUM
td
ggplot(td.query('STATE=="PASS"'))+geom_bar(aes(x='GEN',y='PROP',fill='GEN'),stat='identity')
df
td=df.groupby(['DEP','GEN']).agg({'COUNT':sum}).reset_index()\
.rename(columns={'COUNT':'SUM'}).merge(df)
td
td['PROP']=td.COUNT/td.SUM
td
ggplot(td.query('STATE=="PASS"'))\
+geom_bar(aes(x='GEN',y='PROP',fill='GEN'),stat='identity')\
+facet_wrap('DEP')