import pandas as pd
from pandas import Series,DataFrame




■ RANK

obj = Series([78,80,88,60,50,98,79,99,68,80,80])
obj
0     78
1     80
2     88
3     60
4     50
5     98
6     79
7     99
8     68
9     80
10    80
dtype: int64
# 기본값 , 오름차순으로 순위만 표시

obj.rank()
obj.rank(ascending=True) 
0      4.0
1      7.0
2      9.0
3      2.0
4      1.0
5     10.0
6      5.0
7     11.0
8      3.0
9      7.0
10     7.0
dtype: float64
# 내림차순으로 순위만 표시


obj.rank(ascending=False)
0      8.0
1      5.0
2      3.0
3     10.0
4     11.0
5      2.0
6      7.0
7      1.0
8      9.0
9      5.0
10     5.0
dtype: float64
DataFrame({'순위':obj.rank(ascending=False).astype(int), '점수':obj})     순위  점수
0    8  78
1    5  80
2    3  88
3   10  60
4   11  50
5    2  98
6    7  79
7    1  99
8    9  68
9    5  80
10   5  80
# method='average' : 기본값, 동점자 순위를 평균으로 부여

DataFrame({'순위':obj.rank(ascending=False, method='average').astype(int), '점수':obj})
    순위  점수
0    8  78
1    5  80
2    3  88
3   10  60
4   11  50
5    2  98
6    7  79
7    1  99
8    9  68
9    5  80
10   5  80
# 동점자 모두 작은 등수로 부여

DataFrame({'순위':obj.rank(ascending=False, method='min').astype(int), '점수':obj})
    순위  점수
0    8  78
1    4  80
2    3  88
3   10  60
4   11  50
5    2  98
6    7  79
7    1  99
8    9  68
9    4  80
10   4  80
# 동점자 모두 큰 등수로 부여

DataFrame({'순위':obj.rank(ascending=False, method='max').astype(int), '점수':obj})
    순위  점수
0    8  78
1    6  80
2    3  88
3   10  60
4   11  50
5    2  98
6    7  79
7    1  99
8    9  68
9    6  80
10   6  80
# 동점자 순서대로 순위 부여

DataFrame({'순위':obj.rank(ascending=False, method='first').astype(int), '점수':obj})
    순위  점수
0    8  78
1    4  80
2    3  88
3   10  60
4   11  50
5    2  98
6    7  79
7    1  99
8    9  68
9    5  80
10   6  80
# 순위 갮없이 부여

df = DataFrame({'순위':obj.rank(ascending=False, method='dense').astype(int), '점수':obj})
    순위  점수
0    6  78
1    4  80
2    3  88
3    8  60
4    9  50
5    2  98
6    5  79
7    1  99
8    7  68
9    4  80
10   4  80
# 정렬

df.sort_values(by='순위')


    순위  점수
7    1  99
5    2  98
2    3  88
1    4  80
9    4  80
10   4  80
6    5  79
0    6  78
8    7  68
3    8  60
4    9  50




obj = Series([70,60,80,None,90])
# 기본값 , 오름차순 정렬일 때 None은 맨 뒤에 위치

obj.sort_values()

obj.sort_values(ascending=True, na_position='last')
1    60.0
0    70.0
2    80.0
4    90.0
3     NaN
dtype: float64
# 내림차순

obj.sort_values(ascending=False)
4    90.0
2    80.0
0    70.0
1    60.0
3     NaN
dtype: float64
# 내림차순, None을 맨뒤에 위치

obj.sort_values(ascending=False, na_position='last')
4    90.0
2    80.0
0    70.0
1    60.0
3     NaN
dtype: float64
# None값 맨 앞에 위치

obj.sort_values(ascending=True, na_position='first')
3     NaN
1    60.0
0    70.0
2    80.0
4    90.0
dtype: float64
# Nonerkqt 맨 위에 위치, 내림차순 정렬

obj.sort_values(ascending=False, na_position='first')
3     NaN
4    90.0
2    80.0
0    70.0
1    60.0
dtype: float64
# 기본값, 오름차순, NaN 무시

obj.rank()
obj.rank(ascending=True, na_option='keep')
0    2.0
1    1.0
2    3.0
3    NaN
4    4.0
dtype: float64
# 결측치를 1등으로 만든다.
obj.rank(ascending=True, na_option='top')
0    2.0
1    1.0
2    3.0
3    NaN
4    4.0
dtype: float64
# 결측치를 꼴등으로 만든다.

obj.rank(ascending=True, na_option='bottom')
0    3.0
1    2.0
2    4.0
3    1.0
4    5.0
dtype: float64
# 내림차순, 결측치는 꼴등

obj.rank(ascending=False, na_option='bottom')
0    3.0
1    2.0
2    4.0
3    1.0
4    5.0
# 결측치를 꼴등으로 만든다.

obj.rank(ascending=True, na_option='bottom')
0    2.0
1    1.0
2    3.0
3    5.0
4    4.0
dtype: float64
# 내림차순, 결측치는 꼴등

obj. rank(ascending=False, na_option='bottom')
0    3.0
1    4.0
2    2.0
3    5.0
4    1.0
dtype: float64




df = DataFrame({'영어':[60,80,70],'수학':[50,60,86]}, index=['박찬호','손흥민','이문세'])
df
     영어  수학
박찬호  60  50
손흥민  80  60
이문세  70  86
# 기본값, 열기준으로 오름차순 순위

df.rank()

df.rank(ascending=True, axis=0)
      영어   수학
박찬호  1.0  1.0
손흥민  3.0  2.0
이문세  2.0  3.0
# 행기준 오름차순 순위

df.rank(ascending=True, axis=1)
      영어   수학
박찬호  2.0  1.0
손흥민  2.0  1.0
이문세  1.0  2.0
# 열기준 내림차순 순위

df.rank(ascending=False, axis=0)
      영어   수학
박찬호  3.0  3.0
손흥민  1.0  2.0
이문세  2.0  1.0
# 행기준 내림차순 순위

df.rank(ascending=False, axis=1)
      영어   수학
박찬호  1.0  2.0
손흥민  1.0  2.0
이문세  2.0  1.0




# top-N 분석 ★SQL

emp = pd.read_csv("C:\\data\\emp.csv")
emp.info()

SELECT *
FROM (SELECT rank() over(order by salary desc) rank,
             employee_id,
             salary 
      FROM emp)
WHERE rank <= 10;

emp['rank'] = emp['SALARY'].rank(ascending=False, method='min').astype(int)
top_10 = emp[emp['rank'] <= 10][['rank','EMPLOYEE_ID','SALARY']]
top_10.sort_values(by='rank').reset_index(drop=True)



SELECT *
FROM (SELECT dense_rank() over(order by salary desc) rank,
             employee_id,
             salary
      FROM emp)
WHERE rank <= 10;

emp['rank'] = emp['SALARY'].rank(ascending=False, method='dense').astype(int)
top_10 = emp[emp['rank'] <= 10][['rank','EMPLOYEE_ID','SALARY']]
top_10.sort_values(by='rank').reset_index(drop=True)





■ pandas 날짜

import datetime
datetime.datetime.now()


pd.datetime.now()   # 오류, 구버전
pd.Timestamp.now()
pd.Timestamp.today()

pd.Timestamp(year=2023,month=11,day=21)
pd.Timestamp(year=2023,month=11,day=21,hour=10,minute=30,second=20,microsecond=100,tz='Asia/Seoul')

pd.Timestamp.now().year
pd.Timestamp.now().month
pd.Timestamp.now().day
pd.Timestamp.now().hour
pd.Timestamp.now().minute
pd.Timestamp.now().second
pd.Timestamp.now().microsecond
pd.Timestamp.now().dayofweek    # 숫자 요일 0:월 ~ 6:일
pd.Timestamp.now().weekday      # 0:월 ~ 6:일
pd.Timestamp.now().day_name()   # 영문 요일
pd.Timestamp.now().quarter      # 분기
pd.Timestamp.now().date()       # 날짜 정보



# char -> timestamp 변환
pd.to_datetime('2023-11-21')    # Timestamp('2023-11-21 00:00:00')
pd.to_datetime('2023/11/21')
pd.to_datetime('20231121')
pd.to_datetime('20231121103050',format='%Y%m%d%H%M%S') # Timestamp('2023-11-21 10:30:50')


pd.to_datetime('2023-1-1') - pd.Timestamp.now()     # Timedelta('-325 days +13:17:16.664067')
pd.Timestamp.now() - pd.to_datetime('2023-1-1')     # Timedelta('324 days 10:42:55.096248')
(pd.Timestamp.now() - pd.to_datetime('2023-1-1')).days  # 324

pd.Timestamp.now() + pd.Timedelta('10 days')    # 날짜 + 일수 = 날짜
pd.Timestamp.now() - pd.Timedelta('10 days')    # 날짜 - 일수 = 날짜
pd.Timestamp.now() + pd.Timedelta('10 hours')    # 날짜 + 시간 = 날짜 시간
pd.Timestamp.now() + pd.Timedelta('100 minutes')    # 날짜 + 분 = 날짜 시간
pd.Timestamp.now() + pd.Timedelta('10 min')    # 날짜 + 분 = 날짜 시간
pd.Timestamp.now() + pd.Timedelta('100 second')    # 날짜 + 초 = 날짜 시간
pd.Timestamp.now() + pd.Timedelta('100 sec')    # 날짜 + 초 = 날짜 시간
pd.Timestamp.now() + pd.Timedelta('10 days 10 hours 100 min 3600 sec')    # 날짜 시간
pd.Timestamp.now() + pd.Timedelta('10 days 10:100:3600')    # 날짜 시간

pd.Timestamp.now() + pd.Timedelta('years=1,months=2')   # 오류
pd.Timestamp.now() + pd.DateOffset(years=1,months=2)    



emp.info()

emp[emp['HIRE_DATE'] < '2003-01-01'][['HIRE_DATE']]

emp[pd.to_datetime(emp['HIRE_DATE'],format='%Y-%m-%d') < pd.to_datetime('2003-01-01',format='%Y-%m-%d')][['HIRE_DATE']]

emp['HIRE_DATE']    # dtype: object
emp['HIRE_DATE'] = pd.to_datetime(emp['HIRE_DATE'],format='%Y-%m-%d')
emp['HIRE_DATE']    # dtype: datetime64[ns]

emp['HIRE_DATE'].year       # 단일값이 아니라서 오류
emp['HIRE_DATE'][0].year

emp['HIRE_DATE'].dt.year
emp['HIRE_DATE'].dt.day
emp['HIRE_DATE'].dt.dayofweek
emp['HIRE_DATE'].dt.weekday
...

emp[emp['HIRE_DATE'].dt.dayofweek == 0]  # 월요일 입사자


[문제] 입사년도별 총액 급여를 출력해주세요.
emp['SALARY'].groupby(emp['HIRE_DATE'].dt.year).sum()

 

[문제] 입사요일별 인원수 출력해주세요.(단, 한글요일로 출력)
week = emp['EMPLOYEE_ID'].groupby(emp['HIRE_DATE'].dt.dayofweek).count()
week    # Seires
1) index를 바로 수정
week.index = ['월요일','화요일','수요일','목요일','금요일','토요일','일요일']
week
2) lambda 프로그램
week.index    # index = list()
apply(lambda arg : '월화수목금토일'[arg] + '요일')    # apply = Series or DataFrame

week.index = Series(week.index).apply(lambda arg : '월화수목금토일'[arg] + '요일')
week
3) value_counts() = groupby + count()
emp['HIRE_DATE'].dt.weekday
emp['HIRE_DATE'].dt.weekday.value_counts()
week = emp['HIRE_DATE'].dt.weekday.value_counts().sort_index()
week.index = Series(week.index).apply(lambda arg : '월화수목금토일'[arg] + '요일')
week
4) rename
week = week.rename({0:'월요일',1:'화요일',2:'수요일',3:'목요일',4:'금요일',5:'토요일',6:'일요일'})
week



# value_counts()  = groupby + count()

emp['DEPARTMENT_ID'].value_counts()
emp.DEPARTMENT_ID.value_counts()


[문제] 년도, 분기별 급여의 총액을 출력해주세요.
aa = emp['SALARY'].groupby([emp['HIRE_DATE'].dt.year, emp['HIRE_DATE'].dt.quarter]).sum()
aa.index.names = ['year','quarter']
aa = aa.unstack().fillna(0).astype(int)
aa.columns.name = 'quarter'
aa

 




■ reshaping data 데이터 재구조화

unstack() : 인덱스 값을 컬럼으로 보내는 역할
x.unstack()
x.unstack(level=0)  # 지정된 인덱스의 칼럼으로 이동
x.unstack(level=1)  # 기본값
x.unstack(level='quarter')  # 지정된 인덱스이름의 칼럼으로 이동
x.unstack(level='year')

result = x.unstack()
result


#stack(): 칼럼을 인덱스로 보내는 역할
result.stack()  # 세로방향으로 전환


result.columns = ['1분기','2분기','3분기','4분기']
result

result.index.name = '년도'
result

result.fillna(0,inplace=True)
result

result.stack()





■ pivot
피봇테이블(pivot_table)이란 데이터 열 중에서 두개의 열을 각각 행 인덱스, 열 인덱스로 사용하여
데이터 조회하여 펼처놓은 것을 의미한다.
교차테이블 생성

df.pivot(columns=열, index=행, values=값)

pd.pivot_table(data=대상테이블,
               index=행,
               columns=열,
               values=값,
               aggfunc=집계함수
               )


p = df.pivot(columns='quarter', index='year', values='SALARY')
p
p.stack()


bb = pd.pivot_table(data=emp,
               index=emp['HIRE_DATE'].dt.year,
               columns=emp['HIRE_DATE'].dt.quarter,
               values='SALARY',
               aggfunc='sum')
bb.columns.name = 'quarter'
bb.index.name = 'year'
bb = bb.fillna(0).astype(int)
bb


pd.pivot_table(data=emp,
               index=emp['HIRE_DATE'].dt.year,
               columns=emp['HIRE_DATE'].dt.quarter,
               values='SALARY',
               aggfunc=['sum','mean']).fillna(0)





■ matplotlib
- 시각화 라이브러리

1. pie chart 원 그래프
- 범주형 자료에 대한 그래프를 그릴 때 사용
- 데이터의 구성 요소들이 전체 데이터에서 어느 정도 비율을 차지하는지 파이 조각으로 보여주는 시각화 방법
- 원을 그린 후 그 원에 각 계급의 상대도수에 대한 면적 또는 부분으로 나눈다.
ex) 입사요일, job_id에 대한 비율, 부서코드에 대한 빈도, 선호도 조사

import matplotlib.pyplot as plt


# 한글폰트 인식
from matplotlib import font_manager, rc

font_name = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\gulim.ttc').get_name()
rc('font',family = font_name)


선호도   도수     상대도수
------  ------  --------
좋다      45      0.25
보통      25      0.20
싫다      20      0.20
무응답    10      0.10
         100        1

labels = ['좋다','보통','싫다','무응답']
frequency = [45,25,20,10]
survey = DataFrame({'frequency':frequency},index=labels)
survey.frequency.plot(kind='pie',
                      colors=['skyblue','red','green','bisque'],
                      autopct='%.1f%%',
                           # 부채꼴 안에 표시될 숫자 형식
                      label='',
                      title='고객만족도조사',
                      fontsize = 15,
                      legend=True,      # 범례
                      figsize=(7,5),     # 가로길이, 세로길어
                      startangle=90,
                      textprops={'fontsize':10,'color':'blue'}  # 글자
                      )
plt.pie(survey['frequency'],
        shadow=True,
        startangle=90,
        autopct='%.1f%%',
        explode=(0.1,0,0,0))
plt.legend(labels=survey.index, loc='lower left')
plt.title('고객만족도조사', size=15)

 

[문제] 요일별 입사 인원수에 대해서 pie chart 시각화 해주세요.
week = emp['HIRE_DATE'].dt.weekday.value_counts().sort_index()
week.index = ['월요일','화요일','수요일','목요일','금요일','토요일','일요일']

plt.pie(week,
        autopct='%.1f%%',       # 값 표시
        startangle=90,          # 시작하는 각도
        counterclock=False,     # 시계방향으로 표시
        labels=week.index,
        wedgeprops={'width':0.7, 'edgecolor':'white', 'linewidth':3}
        )                       # 라인옵션 : 도넛모양크기, 라인색상, 라인두께 
plt.title('요일별 입사 인원수', size=15)
plt.show()




2. bar plot 막대 그래프
- 각 범주에서 도수의 크기를 막대 높이로 표현한 그래프이다.
- 항목별 비교할 때 유용하다.

x = [1,2,3]
x = ['SQL','PL/SQL','PYTHON']
y = [90,75,80]
plt.bar(x=x, height=y, color=['orange','green','blue'])
plt.title('과목별 점수 현황', size=10)
plt.xlabel('과목',size=7)
plt.ylabel('점수',size=7)
plt.show()


[문제] 부서별 입사인원수를 막대그래프로 시각화 해주세요.
    단, 소속부서가 없는 사원의 인원수도 출력해주세요.

dept_cnt = emp['DEPARTMENT_ID'].fillna(999).value_counts()
dept_cnt.sort_index(inplace=True)
dept_cnt = dept_cnt.reset_index()
dept_cnt.columns = ['부서','도수']
dept_cnt['부서'] = dept_cnt['부서'].astype(int)
dept_cnt[dept_cnt['부서'] == 999]
dept_cnt.loc[dept_cnt['부서'] == 999,'부서'] = '부서없음'
dept_cnt

# 수직 막대그래프
plt.bar(x=dept_cnt.index, height=dept_cnt['도수'])        # x축은 오브젝트 타입 불가 -> 인덱스 사용
plt.xticks(dept_cnt.index, dept_cnt['부서'],rotation=45)  # 여기서 x축 변경
plt.yticks(range(0,55,5))                                # y축 간격 변경
title_font = {'fontsize':20,'fontweight':'bold'}
plt.title('부서별 입사인원수', fontdict=title_font, pad=10)     # pad : 제목과 그래프 간격
plt.xlabel('부서')
plt.ylabel('인원수')
plt.show()

# 수평 막대그래프
plt.barh(y=dept_cnt.index, width=dept_cnt['도수'])
plt.yticks(dept_cnt.index, dept_cnt['부서'])
plt.xticks(range(0,55,5))
title_font = {'fontsize':20,'fontweight':'bold'}
plt.title('부서별 입사인원수', fontdict=title_font, pad=10)
plt.xlabel('도수')
plt.ylabel('부서')
plt.show()

# 판다스 수직 막대그래프
dept_cnt.도수.plot(kind='bar')
plt.xticks(dept_cnt.index, dept_cnt['부서'],rotation=45)  # 여기서 x축 변경
plt.yticks(range(0,55,5))                                # y축 간격 변경
title_font = {'fontsize':20,'fontweight':'bold'}
plt.title('부서별 입사인원수', fontdict=title_font, pad=10)     # pad : 제목과 그래프 간격
plt.xlabel('부서')
plt.ylabel('도수')
plt.show()

# 판다스 수평 막대그래프
dept_cnt.도수.plot(kind='barh')
plt.yticks(dept_cnt.index, dept_cnt['부서'])
plt.xticks(range(0,55,5))
title_font = {'fontsize':20,'fontweight':'bold'}
plt.title('부서별 입사인원수', fontdict=title_font, pad=10)
plt.ylabel('부서')
plt.xlabel('도수')
plt.show()


data = {'홍길동':[15,13,11], '제임스':[13,14,15], '하든':[10,9,12]}
data
df = DataFrame(data=data, index=[2020, 2021, 2022])
df
df['홍길동'].plot(kind='bar')
df['홍길동'].plot(kind='barh')

> 1개 칼럼
# 그룹형 막대그래프
df.plot(kind='bar')
df.plot(kind='barh')

> 다수 칼럼
# 스택형 막대그래프
df.plot(kind='bar', stacked=True)
df.plot(kind='barh', stacked=True)

> 다수 칼럼 누적형



[문제] 분기별 입사 인원수를 원형차트, 막대그래프로 시작화해주세요.

qtr = emp.HIRE_DATE.dt.quarter.value_counts()
qtr

1) 원형차트

plt.pie(qtr,
        autopct='%.1f%%',
        )
plt.legend(labels=[str(i)+'분기' for i in qtr.index],loc='lower center',ncol=4)    # ncol=4 : 범례를 4개열로 표시
plt.title('분기별 입사 인원 비율')
plt.show()



plt.subplot(행, 열, 위치)     # 여러개의 그래프를 하나의 그림에 나타내도록하는 함수


2) 막대그래프

# 수직 막대 그래프
plt.subplot(1,2,1)  # 1행,2열,1번째 위치
plt.bar(x=qtr.index, height=qtr)
plt.xticks(qtr.index, [str(i)+'분기' for i in qtr.index], rotation=0)
plt.yticks(range(0, 50, 5))
plt.title('분기별 입사 인원수')
plt.xlabel('분기')
plt.ylabel('인원수')
for i in qtr.index:
    plt.text(i-0.1,qtr[i]+0.3,qtr[i])         # 막대에 숫자값 표시
plt.subplot(2,1,1)
p = plt.bar(x=qtr.index, height=qtr, yerr=1)
plt.xticks(qtr.index, [str(i)+'분기' for i in qtr.index])
plt.bar_label(p, padding=1)

 

# 수평 막대 그래프
plt.subplot(1,2,2)  # 1행,2열,2번째 위치
plt.barh(y=qtr.index, width=qtr)
plt.yticks(qtr.index, [str(i)+'분기' for i in qtr.index], rotation=0)
plt.xticks(range(0, 50, 5))
plt.title('분기별 입사 인원수')
plt.ylabel('분기')
plt.xlabel('인원수')
for i in qtr.index:
    plt.text(qtr[i],i,qtr[i])        # 막대에 숫자값 표시
plt.subplot(2,1,2)
p = plt.barh(y=qtr.index, width=qtr, xerr=1)
plt.yticks(qtr.index, labels=[str(i)+'분기' for i in qtr.index])
plt.bar_label(p, padding=1)