tidy data

import pandas as pd 
import numpy as np 
from plotnine import * 
import matplotlib.pyplot as plt 
url = 'https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/state_fruit.csv'
df=pd.read_csv(url)
df
Unnamed: 0 Apple Orange Banana
0 Texas 12 10 40
1 Arizona 9 7 12
2 Florida 0 14 190
df=pd.read_csv(url,index_col=0)
df
Apple Orange Banana
Texas 12 10 40
Arizona 9 7 12
Florida 0 14 190
df.stack() # 인덱스 기준으로 각 열 값 정리
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64
df.stack().reset_index().rename(columns={'level_0':'group1','level_1':'group2',0:'X'})
group1 group2 X
0 Texas Apple 12
1 Texas Orange 10
2 Texas Banana 40
3 Arizona Apple 9
4 Arizona Orange 7
5 Arizona Banana 12
6 Florida Apple 0
7 Florida Orange 14
8 Florida Banana 190
df2=pd.read_csv(url)
df2.rename(columns={'Unnamed: 0':'group1'})
group1 Apple Orange Banana
0 Texas 12 10 40
1 Arizona 9 7 12
2 Florida 0 14 190
df2.rename(columns={'Unnamed: 0':'group1'}).melt(id_vars='group1')
group1 variable value
0 Texas Apple 12
1 Arizona Apple 9
2 Florida Apple 0
3 Texas Orange 10
4 Arizona Orange 7
5 Florida Orange 14
6 Texas Banana 40
7 Arizona Banana 12
8 Florida Banana 190
df2.rename(columns={'Unnamed: 0':'group1'}).melt(id_vars='group1')\
.rename(columns={'variable':'group2','value':'X'})
group1 group2 X
0 Texas Apple 12
1 Arizona Apple 9
2 Florida Apple 0
3 Texas Orange 10
4 Arizona Orange 7
5 Florida Orange 14
6 Texas Banana 40
7 Arizona Banana 12
8 Florida Banana 190
df=pd.read_csv(url,index_col=0)
df
Apple Orange Banana
Texas 12 10 40
Arizona 9 7 12
Florida 0 14 190
df.reset_index()
index Apple Orange Banana
0 Texas 12 10 40
1 Arizona 9 7 12
2 Florida 0 14 190
df.reset_index().melt(id_vars='index')\
.rename(columns={'index':'group1','variable':'group2','value':'X'})
group1 group2 X
0 Texas Apple 12
1 Arizona Apple 9
2 Florida Apple 0
3 Texas Orange 10
4 Arizona Orange 7
5 Florida Orange 14
6 Texas Banana 40
7 Arizona Banana 12
8 Florida Banana 190
df2=pd.read_csv(url)
df2
Unnamed: 0 Apple Orange Banana
0 Texas 12 10 40
1 Arizona 9 7 12
2 Florida 0 14 190
df2.set_index('Unnamed: 0')
Apple Orange Banana
Unnamed: 0
Texas 12 10 40
Arizona 9 7 12
Florida 0 14 190
df2.set_index('Unnamed: 0').stack()
Unnamed: 0        
Texas       Apple      12
            Orange     10
            Banana     40
Arizona     Apple       9
            Orange      7
            Banana     12
Florida     Apple       0
            Orange     14
            Banana    190
dtype: int64
df2.set_index('Unnamed: 0').stack().reset_index()\
.rename(columns={'Unnamed: 0':'group1','level_1':'group2',0:'X'})
group1 group2 X
0 Texas Apple 12
1 Texas Orange 10
2 Texas Banana 40
3 Arizona Apple 9
4 Arizona Orange 7
5 Arizona Banana 12
6 Florida Apple 0
7 Florida Orange 14
8 Florida Banana 190

Barplot + 해들리위컴의 그래프레이어

g=['A']*100+['B']*200 # df에 넣어줄 땐 list 
y=list(np.random.randn(100)*2+2)+list(np.random.randn(200)+3) # df에 넣어줄 땐 list
df=pd.DataFrame({'g':g,'y':y})
df
g y
0 A 1.052926
1 A 0.918170
2 A 4.508794
3 A 3.343357
4 A 5.346823
... ... ...
295 B 4.618109
296 B 3.721053
297 B 3.649687
298 B 0.368584
299 B 2.664331

300 rows × 2 columns

ggplot(df)+geom_bar(aes(x='g',fill='g')) # 디폴트로 Y축은 A와 B의 개수 카운트로 수행 -> 100,200개
<ggplot: (144141037442)>
df.groupby(by='g').count()
y
g
A 100
B 200
df.groupby(by='g').count().reset_index()
g y
0 A 100
1 B 200
fig=ggplot(df.groupby(by='g').count().reset_index())
fig+geom_bar(aes(x='g',y='y',fill='g'),stat='identity')
# stat='identity' (값을 그대로 사용하라는 파라미터)
<ggplot: (144140819628)>
ggplot(df)+geom_bar(aes(x='g',fill='g'),stat='count') 
# stat='count' => 굳이 작성을 안 해줘도 동일한 결과를 얻을 수 있음, 디폴트로 count가 실행되기 때문
<ggplot: (144140709386)>

td=df.groupby(by='g').count().reset_index()
td
g y
0 A 100
1 B 200
plt.bar(td.g,td.y)
<BarContainer object of 2 artists>
td.plot(kind='bar',x='g',y='y')
<AxesSubplot:xlabel='g'>
df.groupby('g').agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})
y
mean median std <lambda_0>
g
A 2.057996 1.996399 2.070265 11.788106
B 2.912674 2.855547 0.987115 5.194664
df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack() # index기준으로 자료들을 stack한다고 생각하면 됨
y
g
A mean 2.057996
median 1.996399
range 11.788106
std 2.070265
B mean 2.912674
median 2.855547
range 5.194664
std 0.987115
df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack().reset_index()
g level_1 y
0 A mean 2.057996
1 A median 1.996399
2 A range 11.788106
3 A std 2.070265
4 B mean 2.912674
5 B median 2.855547
6 B range 5.194664
7 B std 0.987115
td=df.groupby('g')\
.agg({'y':[np.mean,np.median,np.std,lambda x: np.max(x)-np.min(x)]})\
.rename(columns={'<lambda_0>':'range'}).stack().reset_index()
td
g level_1 y
0 A mean 2.057996
1 A median 1.996399
2 A range 11.788106
3 A std 2.070265
4 B mean 2.912674
5 B median 2.855547
6 B range 5.194664
7 B std 0.987115
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity')
<ggplot: (144144674551)>
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')
<ggplot: (144144787038)>
ggplot(td)\
+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()
<ggplot: (144144704871)>
ggplot(td)\
+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()+facet_wrap('level_1')
<ggplot: (144144737437)>
td
g level_1 y
0 A mean 2.057996
1 A median 1.996399
2 A range 11.788106
3 A std 2.070265
4 B mean 2.912674
5 B median 2.855547
6 B range 5.194664
7 B std 0.987115
ggplot(td)\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()
<ggplot: (144144676343)>
ggplot(td)\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity',position='dodge')\
+coord_flip()+facet_wrap('level_1')
<ggplot: (144146538201)>

facet_grid $\to$ (y~x) 꼴

td
g level_1 y
0 A mean 2.057996
1 A median 1.996399
2 A range 11.788106
3 A std 2.070265
4 B mean 2.912674
5 B median 2.855547
6 B range 5.194664
7 B std 0.987115
ggplot(td)+facet_grid('level_1~g')\
+geom_bar(aes(x='g',y='y',fill='g'),stat='identity')+coord_flip()
<ggplot: (144146386439)>

심슨의 역설

DEP=(['A1']*2+['A2']*2+['B1']*2+['B2']*2)*2 
GEN=['M']*8+['F']*8
STATE=['PASS','FAIL']*8
COUNT=[1,9,2,8,80,20,85,15,5,5,5,5,9,1,9,1]
df=pd.DataFrame({'DEP':DEP,'STATE':STATE,'GEN':GEN,'COUNT':COUNT})
df
DEP STATE GEN COUNT
0 A1 PASS M 1
1 A1 FAIL M 9
2 A2 PASS M 2
3 A2 FAIL M 8
4 B1 PASS M 80
5 B1 FAIL M 20
6 B2 PASS M 85
7 B2 FAIL M 15
8 A1 PASS F 5
9 A1 FAIL F 5
10 A2 PASS F 5
11 A2 FAIL F 5
12 B1 PASS F 9
13 B1 FAIL F 1
14 B2 PASS F 9
15 B2 FAIL F 1

시각화1 : 성별 전체합격률

df.groupby(['GEN','STATE']).agg({'COUNT':np.sum})
COUNT
GEN STATE
F FAIL 12
PASS 28
M FAIL 52
PASS 168
df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).index
MultiIndex([('F', 'FAIL'),
            ('F', 'PASS'),
            ('M', 'FAIL'),
            ('M', 'PASS')],
           names=['GEN', 'STATE'])
  • index가 tuple로 MultiIndex로 들어가있음
df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
GEN STATE COUNT
0 F FAIL 12
1 F PASS 28
2 M FAIL 52
3 M PASS 168
df.groupby(['GEN']).agg({'COUNT':np.sum}).reset_index().rename(columns={'COUNT':'SUM'})
GEN SUM
0 F 40
1 M 220

_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
_df2=df.groupby(['GEN']).agg({'COUNT':np.sum}).reset_index().rename(columns={'COUNT':'SUM'})

방법 1

def f(x): 
    if x=='F':
        return 40 
    if x=='M':
        return 220 
_df1['SUM']=list(map(f,_df1.GEN))
_df1
GEN STATE COUNT SUM
0 F FAIL 12 40
1 F PASS 28 40
2 M FAIL 52 220
3 M PASS 168 220

- 방법 2

_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
_df1
GEN STATE COUNT
0 F FAIL 12
1 F PASS 28
2 M FAIL 52
3 M PASS 168
_df2
GEN SUM
0 F 40
1 M 220
def f(x): 
    return lambda x: _df2.query('GEN == @x').SUM.item()
_df1['SUM']=list(map(f(_df1.GEN),_df1.GEN))
_df1
GEN STATE COUNT SUM
0 F FAIL 12 40
1 F PASS 28 40
2 M FAIL 52 220
3 M PASS 168 220

- 방법 3

_df1=df.groupby(['GEN','STATE']).agg({'COUNT':np.sum}).reset_index()
pd.merge(_df1,_df2)
_df1.merge(_df2)
_df2.merge(_df1)

td=_df2.merge(_df1)
td
GEN SUM STATE COUNT
0 F 40 FAIL 12
1 F 40 PASS 28
2 M 220 FAIL 52
3 M 220 PASS 168
td['PROP']=td.COUNT/td.SUM
td
GEN SUM STATE COUNT PROP
0 F 40 FAIL 12 0.300000
1 F 40 PASS 28 0.700000
2 M 220 FAIL 52 0.236364
3 M 220 PASS 168 0.763636
ggplot(td.query('STATE=="PASS"'))+geom_bar(aes(x='GEN',y='PROP',fill='GEN'),stat='identity')
<ggplot: (144145134400)>

시각화2: 학과별 성별에 따른 합격률

df
DEP STATE GEN COUNT
0 A1 PASS M 1
1 A1 FAIL M 9
2 A2 PASS M 2
3 A2 FAIL M 8
4 B1 PASS M 80
5 B1 FAIL M 20
6 B2 PASS M 85
7 B2 FAIL M 15
8 A1 PASS F 5
9 A1 FAIL F 5
10 A2 PASS F 5
11 A2 FAIL F 5
12 B1 PASS F 9
13 B1 FAIL F 1
14 B2 PASS F 9
15 B2 FAIL F 1
td=df.groupby(['DEP','GEN']).agg({'COUNT':sum}).reset_index()\
.rename(columns={'COUNT':'SUM'}).merge(df)
td
DEP GEN SUM STATE COUNT
0 A1 F 10 PASS 5
1 A1 F 10 FAIL 5
2 A1 M 10 PASS 1
3 A1 M 10 FAIL 9
4 A2 F 10 PASS 5
5 A2 F 10 FAIL 5
6 A2 M 10 PASS 2
7 A2 M 10 FAIL 8
8 B1 F 10 PASS 9
9 B1 F 10 FAIL 1
10 B1 M 100 PASS 80
11 B1 M 100 FAIL 20
12 B2 F 10 PASS 9
13 B2 F 10 FAIL 1
14 B2 M 100 PASS 85
15 B2 M 100 FAIL 15
td['PROP']=td.COUNT/td.SUM
td
DEP GEN SUM STATE COUNT PROP
0 A1 F 10 PASS 5 0.50
1 A1 F 10 FAIL 5 0.50
2 A1 M 10 PASS 1 0.10
3 A1 M 10 FAIL 9 0.90
4 A2 F 10 PASS 5 0.50
5 A2 F 10 FAIL 5 0.50
6 A2 M 10 PASS 2 0.20
7 A2 M 10 FAIL 8 0.80
8 B1 F 10 PASS 9 0.90
9 B1 F 10 FAIL 1 0.10
10 B1 M 100 PASS 80 0.80
11 B1 M 100 FAIL 20 0.20
12 B2 F 10 PASS 9 0.90
13 B2 F 10 FAIL 1 0.10
14 B2 M 100 PASS 85 0.85
15 B2 M 100 FAIL 15 0.15
ggplot(td.query('STATE=="PASS"'))\
+geom_bar(aes(x='GEN',y='PROP',fill='GEN'),stat='identity')\
+facet_wrap('DEP')
<ggplot: (129899444131)>