import numpy as np
import pandas as pd
from openpyxl import load_workbook
data = {'Name' : ['S1','S2','S3'],
       'Age' : [25,28,22],
       'Score' : np.array([95,80,75])}
data
{'Name': ['S1', 'S2', 'S3'], 'Age': [25, 28, 22], 'Score': array([95, 80, 75])}
type(data)
dict
data['Name']
['S1', 'S2', 'S3']
df=pd.DataFrame(data,index=['row1','row2','row3'])
df
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
  • 리스트도 df로 변경 가능
data2=[['S1',25,95],['S2',28,80],['S3',22,75]]
data2
[['S1', 25, 95], ['S2', 28, 80], ['S3', 22, 75]]
df2=pd.DataFrame(data2,index=['row1','row2','row3'],columns=['Name','Age','Score'])
df2
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
  • 결론 : list와 dict은 df로 반환시킬 수 있다.

  • Subset Observation 부분 관찰해보자
df
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
df[['Name']]
Name
row1 S1
row2 S2
row3 S3
df['Name']
row1    S1
row2    S2
row3    S3
Name: Name, dtype: object
df[['Name','Score']]
# 이렇게 list안에 넣어줘야 함
Name Score
row1 S1 95
row2 S2 80
row3 S3 75
# 행은 이렇게 추출할 수 없음

그렇다면 행 추출은?

df.loc[['row1']]
Name Age Score
row1 S1 25 95
df.loc['row1']
Name     S1
Age      25
Score    95
Name: row1, dtype: object
df.loc[['row1','row2']]
Name Age Score
row1 S1 25 95
row2 S2 28 80
df.loc['row1','Name']
'S1'
df['Name']
row1    S1
row2    S2
row3    S3
Name: Name, dtype: object
  • 사용하고 싶다면 이렇게 사용
df.loc[:,'Name']
row1    S1
row2    S2
row3    S3
Name: Name, dtype: object
df.loc[:,['Name']]
Name
row1 S1
row2 S2
row3 S3
df.loc[:,['Score','Name']]
Score Name
row1 95 S1
row2 80 S2
row3 75 S3
df.loc[:,'Name':'Age'] 
Name Age
row1 S1 25
row2 S2 28
row3 S3 22
df.iloc[0,0]
'S1'
df.iloc[2,0]
'S3'
df.iloc[:,[0,2]]
Name Score
row1 S1 95
row2 S2 80
row3 S3 75
df.iloc[:,0:2]
Name Age
row1 S1 25
row2 S2 28
row3 S3 22
df.iloc[::2,[0,2]]
Name Score
row1 S1 95
row3 S3 75
df.iloc[[-1],:]
Name Age Score
row3 S3 22 75
df.iloc[-1,:]
Name     S3
Age      22
Score    75
Name: row3, dtype: object
df
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
df.iloc[-1::-1,:]
Name Age Score
row3 S3 22 75
row2 S2 28 80
row1 S1 25 95
df.head(2)
Name Age Score
row1 S1 25 95
row2 S2 28 80
df.tail(1)
Name Age Score
row3 S3 22 75
df.info() 
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, row1 to row3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Score   3 non-null      int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 192.0+ bytes
df.describe()
# 통계적 수치들을 알 수 있음
Age Score
count 3.0 3.000000
mean 25.0 83.333333
std 3.0 10.408330
min 22.0 75.000000
25% 23.5 77.500000
50% 25.0 80.000000
75% 26.5 87.500000
max 28.0 95.000000
  • deep copy $\to$ copy
  • shallow copy $\to$ view
df2=df.copy()
df2
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
df2.loc['row2','Score']=np.NaN
df2
Name Age Score
row1 S1 25 95.0
row2 S2 28 NaN
row3 S3 22 75.0
df
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75

deep copy

df2.nunique()
Name     3
Age      3
Score    2
dtype: int64
df2['Score'].nunique()
2
df2
Name Age Score
row1 S1 25 95.0
row2 S2 28 NaN
row3 S3 22 75.0
df2['Score'].value_counts()
# 각 값들의 갯수를 알 수 있음
95.0    1
75.0    1
Name: Score, dtype: int64
df3=df2.copy()
df3.loc['row3','Score']=df2.loc['row1','Score']
df3['Score'].value_counts()
95.0    2
Name: Score, dtype: int64
df3['Score'].count()
2
  • value_counts() = 각 값 별 몇 개?
  • count = 그냥 총 값이 몇 개 들어가 있는지?
df3['Age'].count()
3
df['Score'].sum()
250
df.max()
Name     S3
Age      28
Score    95
dtype: object
df['Score'].std()
10.408329997330664
df.describe()
Age Score
count 3.0 3.000000
mean 25.0 83.333333
std 3.0 10.408330
min 22.0 75.000000
25% 23.5 77.500000
50% 25.0 80.000000
75% 26.5 87.500000
max 28.0 95.000000

df4=df.copy()
df4
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75
df4=df4.iloc[:,[0,2,1]]
df4
Name Age Score
row1 S1 25 95
row2 S2 28 80
row3 S3 22 75

df의 열을 변경해서 df4에 저장

data={
    'class' : ['A','B','C','A','B','C','C'],
    'name' : ['S1','S2','S3','S4','S5','S6','S7'],
    'age' : [20,19,21,22,24,25,26],
    'score' : [90,95,75,80,70,85,90]}
df=pd.DataFrame(data)
df
class name age score
0 A S1 20 90
1 B S2 19 95
2 C S3 21 75
3 A S4 22 80
4 B S5 24 70
5 C S6 25 85
6 C S7 26 90
  • 이처럼 dict을 df에 활용할 수 있다.
df['score']>=80
0     True
1     True
2    False
3     True
4    False
5     True
6     True
Name: score, dtype: bool
df.loc[df['score']>=80]
class name age score
0 A S1 20 90
1 B S2 19 95
3 A S4 22 80
5 C S6 25 85
6 C S7 26 90
df.loc[df['score']>=80,['name','age']]
name age
0 S1 20
1 S2 19
3 S4 22
5 S6 25
6 S7 26
df[df['score']>=80]
class name age score
0 A S1 20 90
1 B S2 19 95
3 A S4 22 80
5 C S6 25 85
6 C S7 26 90
df['result']='NONE'
df
class name age score result
0 A S1 20 90 NONE
1 B S2 19 95 NONE
2 C S3 21 75 NONE
3 A S4 22 80 NONE
4 B S5 24 70 NONE
5 C S6 25 85 NONE
6 C S7 26 90 NONE
df.loc[df['score']>=80,'result']='PASS'
df.loc[df['score']<80,'result']='FAIL'
df
class name age score result
0 A S1 20 90 PASS
1 B S2 19 95 PASS
2 C S3 21 75 FAIL
3 A S4 22 80 PASS
4 B S5 24 70 FAIL
5 C S6 25 85 PASS
6 C S7 26 90 PASS
idx = df['result'] == 'PASS'
df.loc[idx]
class name age score result
0 A S1 20 90 PASS
1 B S2 19 95 PASS
3 A S4 22 80 PASS
5 C S6 25 85 PASS
6 C S7 26 90 PASS
df_sorted=df.loc[idx].sort_values('score',ascending=False)
df_sorted
class name age score result
1 B S2 19 95 PASS
0 A S1 20 90 PASS
6 C S7 26 90 PASS
5 C S6 25 85 PASS
3 A S4 22 80 PASS
  • excel 파일 만들고 불러오기
df_sorted.to_excel('data_sorted.xlsx',index=False)
df_import=pd.read_excel('data_sorted.xlsx')
df_import
class name age score result
0 B S2 19 95 PASS
1 A S1 20 90 PASS
2 C S7 26 90 PASS
3 C S6 25 85 PASS
4 A S4 22 80 PASS
df.groupby(by='class').mean()
# mean처리 할 수 있는 열에 대해서만 mean 처리 함
age score
class
A 21.0 85.000000
B 21.5 82.500000
C 24.0 83.333333
df.groupby(by='class').std()
age score
class
A 1.414214 7.071068
B 3.535534 17.677670
C 2.645751 7.637626

plotting

df
class name age score result
0 A S1 20 90 PASS
1 B S2 19 95 PASS
2 C S3 21 75 FAIL
3 A S4 22 80 PASS
4 B S5 24 70 FAIL
5 C S6 25 85 PASS
6 C S7 26 90 PASS
df.plot.bar('name','score')
<AxesSubplot:xlabel='name'>
df.loc[[0,2],'score']=np.NaN
df
class name age score result
0 A S1 20 NaN PASS
1 B S2 19 95.0 PASS
2 C S3 21 NaN FAIL
3 A S4 22 80.0 PASS
4 B S5 24 70.0 FAIL
5 C S6 25 85.0 PASS
6 C S7 26 90.0 PASS
df.isnull()
class name age score result
0 False False False True False
1 False False False False False
2 False False False True False
3 False False False False False
4 False False False False False
5 False False False False False
6 False False False False False
df.dropna()
# 데이터 없는 행은 다 날림
class name age score result
1 B S2 19 95.0 PASS
3 A S4 22 80.0 PASS
4 B S5 24 70.0 FAIL
5 C S6 25 85.0 PASS
6 C S7 26 90.0 PASS
value=0
df.fillna(value) ## NaN값만 value로 채워줌
class name age score result
0 A S1 20 0.0 PASS
1 B S2 19 95.0 PASS
2 C S3 21 0.0 FAIL
3 A S4 22 80.0 PASS
4 B S5 24 70.0 FAIL
5 C S6 25 85.0 PASS
6 C S7 26 90.0 PASS
df.replace(np.nan,54)
class name age score result
0 A S1 20 54.0 PASS
1 B S2 19 95.0 PASS
2 C S3 21 54.0 FAIL
3 A S4 22 80.0 PASS
4 B S5 24 70.0 FAIL
5 C S6 25 85.0 PASS
6 C S7 26 90.0 PASS
df.interpolate()
class name age score result
0 A S1 20 NaN PASS
1 B S2 19 95.0 PASS
2 C S3 21 87.5 FAIL
3 A S4 22 80.0 PASS
4 B S5 24 70.0 FAIL
5 C S6 25 85.0 PASS
6 C S7 26 90.0 PASS
  • 이렇게 위 아래 값의 평균으로 채워주기도 하는데 위아래 둘다 있는 경우에만 사용가능하다
def add_one(x):
    return x+1
add_one(1001)
1002
  • apply 활용하기
df['age']=df['age'].apply(add_one)
df
class name age score result
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS

이렇게 df의 age를 변경해줄 수 있음

df['score'].apply(np.square)
0       NaN
1    9025.0
2       NaN
3    6400.0
4    4900.0
5    7225.0
6    8100.0
Name: score, dtype: float64

df
class name age score result
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS
df.filter(regex='[rn]')
# n 또는 r이 들어간 columns 추출
name score result
0 S1 NaN PASS
1 S2 95.0 PASS
2 S3 NaN FAIL
3 S4 80.0 PASS
4 S5 70.0 FAIL
5 S6 85.0 PASS
6 S7 90.0 PASS
df_vertical=pd.concat([df,df])
df_vertical
class name age score result
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS
df_vertical=pd.concat([df,df],ignore_index=True)
df_vertical
class name age score result
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS
7 A S1 22 NaN PASS
8 B S2 21 95.0 PASS
9 C S3 23 NaN FAIL
10 A S4 24 80.0 PASS
11 B S5 26 70.0 FAIL
12 C S6 27 85.0 PASS
13 C S7 28 90.0 PASS
df_horizontal=pd.concat([df,df],axis=1)
df_horizontal
class name age score result class name age score result
0 A S1 22 NaN PASS A S1 22 NaN PASS
1 B S2 21 95.0 PASS B S2 21 95.0 PASS
2 C S3 23 NaN FAIL C S3 23 NaN FAIL
3 A S4 24 80.0 PASS A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS C S6 27 85.0 PASS
6 C S7 28 90.0 PASS C S7 28 90.0 PASS

df.to_csv('data_text.txt',sep='\t',index=False)
# txt파일로 변환할 때 어떻게 구분해서 타이핑해넣을 것인가
# sep=\t,여기선 지금 탭으로 구분지었음
pd.read_csv('data_text.txt',delimiter='\t')
# delimiter로 txt파일이 어떻게 이루어져 있나 알려줘야함
class name age score result
0 A S1 22 NaN PASS
1 B S2 21 95.0 PASS
2 C S3 23 NaN FAIL
3 A S4 24 80.0 PASS
4 B S5 26 70.0 FAIL
5 C S6 27 85.0 PASS
6 C S7 28 90.0 PASS

from sympy import symbols
x=symbols('x')
type(x)
sympy.core.symbol.Symbol
2*x
$\displaystyle 2 x$

즉 x자체가 symbol로 들어갔음

expr=2*x
expr.subs(x,3)
$\displaystyle 6$

미분해보자

f=x**3
from sympy import diff
df1=diff(f,x)
df1
$\displaystyle 3 x^{2}$
df2=diff(df1,x)
df2
$\displaystyle 6 x$

from sympy import sin
f=sin(x)
df1=diff(f,x)
df1
$\displaystyle \cos{\left(x \right)}$

from sympy import integrate
integrate(f,(x,0,2*3.14))
$\displaystyle 5.07308662478501 \cdot 10^{-6}$

0에 가깝게 나옴

integrate(f,(x,0,3.14))
$\displaystyle 1.99999873172754$

from sympy import limit
limit(sin(x)/x,x,0)
$\displaystyle 1$

import matplotlib.pyplot as plt
from scipy import interpolate
x=np.array([1,2,3,4,5])
y=np.array([1,0.8,0.4,0.3,0.2])
plt.plot(x,y,'*')
[<matplotlib.lines.Line2D at 0x1bb1062f8b0>]
f_lin=interpolate.interp1d(x,y)
x_new = np.arange(1,5,0.1)
y_new = f_lin(x_new) 
fig,ax=plt.subplots()
ax.plot(x,y,'o',label='Data')
ax.plot(x_new,y_new,label='linear')
ax.legend()
<matplotlib.legend.Legend at 0x1bb1e842fa0>

선형 보간됨을 알 수 있다.


tck=interpolate.splrep(x,y,s=0)
y_spl=interpolate.splev(x_new,tck,der=0)
fig,ax=plt.subplots()
ax.plot(x,y,'o',label='Data')
ax.plot(x_new,y_new,label='linear')
ax.plot(x_new,y_spl,label='spline')
ax.legend()
<matplotlib.legend.Legend at 0x1bb1e8af9a0>

원형 보간 추가


BGR 플랏

import cv2 as cv
im=cv.imread('KakaoTalk_20211210_090822964.png')
plt.figure()
plt.imshow(im)
plt.title('Original')
Text(0.5, 1.0, 'Original')
  • BGR로 들어오기 때문에 RGB로 바꿔 줄 필요가 있음
rgb=cv.cvtColor(im,cv.COLOR_BGR2RGB)
plt.figure()
plt.imshow(rgb)
plt.title('RGB')
Text(0.5, 1.0, 'RGB')
GRAY=cv.cvtColor(im,cv.COLOR_BGR2GRAY)
plt.figure()
plt.imshow(GRAY,cmap='gray')
plt.title('GRAY')
Text(0.5, 1.0, 'GRAY')
blur=cv.blur(im,(100,100))
blur=cv.cvtColor(blur,cv.COLOR_BGR2RGB)
plt.subplot(121) # 가로줄 한개, 세로줄 두개, 첫번째에 놓겠다
plt.imshow(rgb)
plt.title('RGB')
plt.subplot(122) # 가로줄 한개, 세로줄 두개, 두번째에 놓겠다 
plt.imshow(blur)
plt.title('blur')
Text(0.5, 1.0, 'blur')

edges=cv.Canny(GRAY,0,100)
plt.subplot(121) # 가로줄 한개, 세로줄 두개, 첫번째에 놓겠다
plt.imshow(GRAY,cmap='gray')
plt.title('GRAY')
plt.subplot(122) # 가로줄 한개, 세로줄 두개, 두번째에 놓겠다 
plt.imshow(edges)
plt.title('edge detection')
Text(0.5, 1.0, 'edge detection')

머신러닝과 연관하여 비디오 영상의 움직이는 사물을 찾거나 번호판 또는 숫자 인식 여러 분야에서 활용이 가능하다