33 PYTHON 그래프, CLASS

ryunyryuny 2023. 11. 22. 17:50

# pandas
import pandas as pd
from pandas import Series,DataFrame

# pandas_csv
emp = pd.read_csv("C:\\data\\emp.csv")
emp.info()

# 문자형 >> 날짜형
emp.HIRE_DATE = pd.to_datetime(emp.HIRE_DATE)

# matplotlib
import matplotlib.pyplot as plt

# matplotlib_한글폰트
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname='C:\\Windows\\Fonts\\malgun.ttf').get_name()
rc('font',family = font_name)

# numpy
import numpy as np

3. line plot
- 선을 그리는 그래프
- 시간, 순서 등에 따라 어떻게 변하는지를 보여주는 그래프

# Count hires per year, then order the counts chronologically by year.
years = emp.HIRE_DATE.dt.year.value_counts()
years.sort_index(inplace=True)

# pandas: line plot of the yearly hire counts.
years.plot()
plt.xticks(ticks=years.index,
           labels=[str(i)+'년' for i in years.index],
           rotation=45)
plt.xlabel('')
plt.ylabel('인원수', size=10)
plt.title('년도별 입사현황', size=20)
plt.show()

# numpy linspace: generates evenly spaced numbers over an interval.
np.linspace(start=0, stop=1, num=10, endpoint=True)   # default: the stop value is included
np.linspace(start=0, stop=1, num=10, endpoint=False)  # the stop value is excluded

# cmap: a matplotlib colormap (palette), obtained from pyplot, not from numpy.
cmap = plt.get_cmap('PuRd')
cmap
colors = [cmap(i) for i in np.linspace(start=0, stop=1, num=8, endpoint=True)]  # 8 steps
colors

# matplotlib bar chart with a plain text label placed at (2005, 29).
plt.bar(x=years.index, height=years, color=colors)
plt.text(2005,29,'최대값')

# matplotlib bar chart with an arrow annotation pointing at (2005, 29).
plt.bar(x=years.index, height=years, color=colors)
plt.annotate(text='max',
             xy=(2005,29),
             xytext=(2002,25),
             arrowprops={'arrowstyle':'wedge', 'facecolor':'red','color':'blue'})

# matplotlib line plot drawn with a dotted line, same labels and annotation.
plt.plot(years.index, years, linestyle=':')
plt.xticks(ticks=years.index,
           labels=[str(i)+'년' for i in years.index],
           rotation=45)
plt.xlabel('')
plt.ylabel('인원수', size=10)
plt.title('년도별 입사현황', size=20)
plt.annotate(text='max',
             xy=(2005,29),
             xytext=(2003,25),
             arrowprops={'arrowstyle':'wedge', 'facecolor':'red','color':'blue'})
plt.show()

# linestyle
dashdot -.
dashed  --
dotted  :
solid   -
...

[문제] 년도 분기별 입사현황 막대그래프를 생성해주세요.

# Count EMPLOYEE_ID values per hire year (rows) and hire quarter (columns).
df = emp.pivot_table(
    values='EMPLOYEE_ID',
    index=emp.HIRE_DATE.dt.year,
    columns=emp.HIRE_DATE.dt.quarter,
    aggfunc='count',
)
# Year/quarter combinations without any rows come back as NaN; show them as 0.
df = df.fillna(0)
df

# pandas
# pandas: bar chart of df, one group of bars per row and one bar per column.
df.plot(kind = 'bar')
plt.title('년도별 분기별 입사현황', size=20)
plt.legend(labels = [str(i)+'분기' for i in df.columns],
           loc = 'upper left')
# One tick per row of df, so the tick count always matches the label count
# instead of assuming there are exactly 8 rows.
plt.xticks(ticks = range(len(df.index)),
           labels = [str(i)+'년' for i in df.index],
           rotation = 45)
plt.xlabel('')

** frequency table 도수분포표
- 미리 구간을 설정해 각 구간의 범위안에 조사된 데이터들이 몇개씩 속하는가를 나타내는 표
- 연속형자료

ages = [21,24,26,27,29,31,37,39,40,42,45,50,51,59,60,69, ...]

계급          도수
---------------
20(20~29)     5
30(30~39)     3
40(40~49)     3
50(50~59)     3
60(60~)       2

key : value
frequency_table = {20:0,30:0,40:0,50:0,60:0, ...}
↓
# Build the frequency table as a dict (key = class lower bound, value = count).
ages = [21,24,26,27,29,31,37,39,40,42,45,50,51,59,60,69]

frequency_table = {}
for i in range(20, 61, 10):
    frequency_table.setdefault(i, 0)    # start every class at 0
frequency_table

for i in ages:
    if 20 <= i < 30:
        frequency_table[20] += 1
    elif 30 <= i < 40:
        frequency_table[30] += 1
    elif 40 <= i < 50:
        frequency_table[40] += 1
    elif 50 <= i < 60:
        frequency_table[50] += 1
    elif i >= 60:
        frequency_table[60] += 1
frequency_table     # {20: 5, 30: 3, 40: 3, 50: 3, 60: 2}

# Convert the numeric data into categorical labels, then count the frequencies.
ages = [21,24,26,27,29,31,37,39,40,42,45,50,51,59,60,69]
ages_label = []
for i in ages:
    if 20 <= i < 30:
        ages_label.append('20대')
    elif 30 <= i < 40:
        ages_label.append('30대')
    elif 40 <= i < 50:
        ages_label.append('40대')
    elif 50 <= i < 60:
        ages_label.append('50대')
    elif i >= 60:
        ages_label.append('60대')
ages_label

# Series
Series(ages_label).value_counts()
# 20대    5
# 30대    3
# 40대    3
# 50대    3
# 60대    2
# Name: count, dtype: int64

# pandas
pd.crosstab(index=Series(ages_label), columns='빈도수')
# col_0  빈도수
# row_0
# 20대      5
# 30대      3
# 40대      3
# 50대      3
# 60대      2

# numpy
np.unique(Series(ages_label), return_counts=True)
# Call once and unpack: x holds the unique labels, y the matching counts.
x, y = np.unique(Series(ages_label), return_counts=True)
DataFrame({'계급':x, '도수':y})
#     계급  도수
# 0  20대   5
# 1  30대   3
# 2  40대   3
# 3  50대   3
# 4  60대   2

■ cut
- 연속형 데이터를 범주형 데이터로 변환하는 함수

ages = [21,24,26,27,29,31,37,39,40,42,45,50,51,59,60,69]
bins = [20,30,40,50,60,70]

# right=True  -> intervals such as (20, 30], i.e. 20 < ages <= 30
pd.cut(x=ages, bins=bins, right=True)
# right=False -> intervals such as [20, 30), i.e. 20 <= ages < 30
pd.cut(x=ages, bins=bins, right=False)

pd.cut(x=ages, bins=bins, right=False)
pd.cut(x=ages, bins=bins, right=False).value_counts()
# [20, 30)    5
# [30, 40)    3
# [40, 50)    3
# [50, 60)    3
# [60, 70)    2
# Name: count, dtype: int64

# .codes: for each value, the index of its interval within .categories
pd.cut(x=ages, bins=bins, right=False).codes
# array([0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4], dtype=int8)

pd.cut(x=ages, bins=bins, right=False).categories
# IntervalIndex([[20, 30), [30, 40), [40, 50), [50, 60), [60, 70)], dtype='interval[int64, left]')

4. histogram
자료가 모여있는 위치나 자료분포에 관한 대략적인 정보를 한 눈에 파악할 수 있는 그래프

ages = [21,24,26,27,29,31,37,39,40,42,45,50,51,59,60,69,22,49]
bins = [20,30,40,50,60,70]

plt.hist(ages)                  # default binning
plt.hist(ages, bins=5)          # bins=5: split the data range into 5 intervals
plt.hist(ages, bins='auto')     # let the library pick the binning
plt.hist(ages, bins=bins)       # explicit bin edges from the list above
plt.hist(ages, bins=bins, density=True, histtype='step')
plt.hist(ages, bins=bins, orientation='horizontal', color='pink', rwidth=0.9)

# numpy_txt: load the weight data from a text file into a numpy array
weight = np.loadtxt("C:\\data\\weight.txt")

type(weight)                    # numpy.ndarray
weight.shape                    # 5 rows, 10 columns

weight.reshape((1,50))          # 1 row, 50 columns
weight.reshape((50,1))          # 50 rows, 1 column

weight = weight.reshape((50,))  # flatten to one dimension with 50 elements
weight.shape                    # (50,)
weight.max()                    # 93.0
weight.min()                    # 52.0
weight

bins = list(range(50,101,10))   # 6 bin edges
bins                            # [50, 60, 70, 80, 90, 100]

label = [str(i)+'kg 이상' for i in bins]  # 6 labels, one per bin edge
label                           # ['50kg 이상', '60kg 이상', '70kg 이상', '80kg 이상', '90kg 이상', '100kg 이상']

# 6 edges produce only 5 intervals -> Categories (5, interval[int64, right]),
# so the 6-element label list is one entry too long to be used as labels=.
pd.cut(x=weight, bins=bins)

label.pop()                     # remove the last label so that 5 remain
label                           # ['50kg 이상', '60kg 이상', '70kg 이상', '80kg 이상', '90kg 이상']

# NOTE(review): the intervals are right-closed (see interval[int64, right] above),
# e.g. (50, 60], while the labels read "X kg or more" — confirm whether
# right=False was intended.
pd.cut(x=weight, bins=bins, labels=label).value_counts()
50kg 이상     2
60kg 이상    15
70kg 이상    23
80kg 이상     9
90kg 이상     1
Name: count, dtype: int64

Series(weight).describe()

count    50.000000
mean     74.040000
std       8.682518
min      52.000000
25%      68.250000
50%      74.500000  # 중앙값
75%      79.000000
max      93.000000
dtype: float64

# pandas: quantile takes a fraction between 0 and 1
Series(weight).quantile(0)      # min
Series(weight).quantile(.10)    # 10th percentile
Series(weight).quantile(.25)    # 25th percentile
Series(weight).quantile(.50)    # 50th percentile (median)
Series(weight).quantile(.90)    # 90th percentile
Series(weight).quantile(1)      # max

# numpy: percentile takes a percentage between 0 and 100
np.percentile(weight, 0)    # min
np.percentile(weight, 50)    # 50th percentile (median)
np.percentile(weight, [0,25,50,75,100])

weight[weight >= np.percentile(weight, 95)]     # values at or above the 95th percentile (top 5%)
weight[weight <= np.percentile(weight, 10)]     # values at or below the 10th percentile (bottom 10%)

5. box plot
- 데이터가 어떤 범위에 걸쳐 존재하는지 분포를 체크할 때 사용되는 그래프
- 5개 수치 요약을 제공하는 그래프
- 이상치 데이터를 확인 할 때 좋은 그래프

# Box plot of the weight data, first with the default orientation and then
# with vert=False.
plt.boxplot(weight, labels=['몸무게'])
plt.boxplot(weight, labels=['몸무게'], vert=False)

# 사분위수(Quartile)
- 데이터 표본을 동일하게 4개로 나눈 값을 확인하는 방법

# Five-number summary of weight. The 0th and 100th percentiles are stored as
# q0 / q4 rather than min / max so the Python builtins min() and max() are
# not shadowed for the rest of the session.
q0 = np.percentile(weight, 0)       # minimum
q1 = np.percentile(weight, 25)      # first quartile
q2 = np.percentile(weight, 50)      # second quartile (median)
q3 = np.percentile(weight, 75)      # third quartile
q4 = np.percentile(weight, 100)     # maximum

** 사분위범위(Inter Quartile Range)
- 1사분위수와 3사분위수 사이의 거리

# Interquartile range: distance between the first and third quartiles.
iqr = q3 - q1
iqr                     # 10.75

# lower fence
lf = q1 - 1.5 * iqr
lf                      # 52.125

# upper fence
uf = q3 + 1.5 * iqr
uf                      # 95.125

# Values outside the range lf ~ uf are treated as outliers.
weight[weight < lf]     # array([52.])
weight[weight > uf]     # array([], dtype=float64)

weight[weight >= lf].min()      # smallest value that is not below the lower fence
weight[weight <= uf].max()      # largest value that is not above the upper fence

plt.boxplot(weight, labels=['몸무게'], vert=False)
# Label every outlier. Looping (instead of indexing [0]) also works when there
# are no outliers or more than one.
for outlier in weight[(weight < lf) | (weight > uf)]:
    plt.text(outlier, 1.05, outlier, color = 'red')
# Label the three quartiles.
plt.text(q1, 1.1, q1, color = 'red')
plt.text(q2, 1.1, q2, color = 'red')
plt.text(q3, 1.1, q3, color = 'red')
# Label the smallest and largest values that lie inside the fences.
plt.text(weight[weight >= lf].min(), 1.1,
         weight[weight >= lf].min(), color = 'red')
plt.text(weight[weight <= uf].max(), 1.1,
         weight[weight <= uf].max(), color = 'red')
plt.show()

# pandas_excel: load the height data from an Excel file into a DataFrame
height = pd.read_excel("C:\\data\\height.xlsx")

height.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 27 entries, 0 to 26
# Data columns (total 2 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   남자      27 non-null     int64
#  1   여자      27 non-null     int64
# dtypes: int64(2)

height.describe()
#                남자          여자
# count   27.000000   27.000000
# mean   173.555556  162.592593
# std      5.315676    5.507053
# min    163.000000  152.000000
# 25%    170.000000  160.000000
# 50%    173.000000  160.000000
# 75%    178.000000  165.500000
# max    183.000000  180.000000

# Side-by-side box plots of the two columns.
plt.boxplot([height['남자'], height['여자']], labels=['남자','여자'])

# Histogram of both columns on the same axes, with a legend.
plt.hist([height['남자'], height['여자']], label=['남자','여자'])
plt.legend()

6. 줄기 잎 그림 (stem and leaf diagram)
- 연속형자료의 특성을 나타내고자 할 때 사용하는 그래프

Anaconda Prompt - 관리자모드
> pip install stemgraphic # 없으면 설치

import stemgraphic

stemgraphic.stem_graphic(height.남자)
stemgraphic.stem_graphic(height.여자)

7. 산점도 scatter plot
- 두 연속형 변수 사이의 관계(선형관계)를 보여주는 그래프

# Scatter plot with DEPARTMENT_ID on the x axis and SALARY on the y axis,
# drawn with green 'd' markers.
plt.scatter(x=emp['DEPARTMENT_ID'],
            y=emp['SALARY'],
            marker='d',
            facecolor='g')

- marker : 모양 : v(역삼각형), s(스퀘어), d(다이아몬드), ...

8. folium 지도 시각화 라이브러리

Anaconda Prompt - 관리자모드
> pip install folium # 없으면 설치

import folium

latitude = 37.498852    # latitude
longitude = 127.031775  # longitude

# Map centered on the coordinates, with a red star marker and a popup text.
m = folium.Map(location = [latitude, longitude],
               zoom_start = 15,
               tiles = 'OpenStreetMap')
folium.Marker(location = [latitude, longitude],
              popup = 'itwill',
              icon = folium.Icon(color = 'red',icon = 'star')).add_to(m)
m.save('C:\\data\\seoul.html')     # save to a file

# Map with a higher zoom_start and a red circle marker with a tooltip.
# NOTE: this saves to the same path as above, so it replaces the first map.
m = folium.Map(location = [latitude, longitude],
               zoom_start = 18)
folium.CircleMarker(location = [latitude, longitude],
                    color = 'red',
                    radius = 30,
                    tooltip = '학원주변').add_to(m)
m.save('C:\\data\\seoul.html')     # save to a file

result = 0  # module-level running total, shared by every caller of add()


def add(arg):
    """Add ``arg`` to the module-level ``result`` total.

    Nothing is returned; read the global ``result`` to see the new total.
    """
    global result   # declare the global so ``+=`` rebinds it instead of a local
    result += arg


# << session 1 >>
add(10)
result  # 10
add(20)
result  # 30

# << session 2 >>
add(10)
result  # 40 -> the work keeps accumulating in the other session as well
add(20)
result  # 60

■ 절차적(구조적) 지향 프로그램(procedural language)
- C, R, PL/SQL
- 물이 위에서 아래로 흐르는 것처럼 순차적인 처리가 중요시되며 프로그램 전체가 유기적으로
    연결되도록 만드는 프로그래밍 기법이다.
- 단점
    - 재사용할 수 없다.
    - 확장성이 떨어진다.
    - 유지보수가 어렵다.


■ 객체지향 프로그램(Object Oriented Language)
- C++, JAVA, C#, PYTHON
- 구조적인 프로그래밍과 다르게 큰 문제를 작은 문제들로 해결할 수 있는 객체들을 만든 뒤
    이 객체들을 조합해서 큰 문제를 해결하는 방법

# 클래스 생성 : 설계도

class Calculator:
    """Accumulator that keeps its running total on the instance."""

    def __init__(self):
        # Runs automatically when an instance is created; each instance gets
        # its own ``result`` attribute starting at 0.
        self.result = 0

    def add(self, arg):
        """Add ``arg`` to this instance's total and return the updated total.

        ``arg`` is a formal parameter, local to this call.
        """
        self.result += arg
        return self.result

# 인스턴스 생성 : 클래스를 메모리에 만들어서 사용

# << session 1 >>
s1 = Calculator()   # s1: an instance
s1.add(10)          # 10
s1.add(20)          # 30
s1.result           # 30
s1.result = 50      # the value can be changed from outside
s1.add(30)          # 80

# << session 2 >>
s2 = Calculator()   # s2: a separate instance
s2.add(100)         # 100 -> s1's total is not carried over
s2.add(200)         # 300
s2.result           # 300

__init__(self) : 생성자(메소드,함수)
                클래스를 인스턴스화 할때 자동으로 수행되는 생성자이다.
                무조건 실행
                인스턴스 변수를 초기화 해줄때 사용한다.

# Variant 1: the message is printed by __init__, so creating an instance
# prints it.
class hello:
    def __init__(self):
        print('오늘 하루도 수고하셨습니다.!!')


h = hello()     # 오늘 하루도 수고하셨습니다.!!


# Variant 2: the message is printed by an ordinary method, so it only appears
# when the method is called.
class hello:
    def message(self):
        print('오늘 하루도 수고하셨습니다.!!')


h = hello()
h.message()     # 오늘 하루도 수고하셨습니다.!!

# Variant 1: name and age are local variables inside the method.
class Person:
    def myprint(self):
        name = '홍길동'    # local variable
        age = 20           # local variable
        print('이름은 {}, 나이는 {}'.format(name,age))


p1 = Person()
p1.myprint()    # 이름은 홍길동, 나이는 20
# p1.name()     # not accessible: name is a local variable of myprint, not an
#               # attribute, so this line raises AttributeError if executed


# Variant 2: name and age are defined in the class body and read through self.
class Person:
    name = '홍길동'    # class attribute, readable through every instance
    age = 20           # class attribute

    def myprint(self):
        print('이름은 {}, 나이는 {}'.format(self.name,self.age))


p = Person()
p.myprint()         # 이름은 홍길동, 나이는 20
p.name              # '홍길동', accessible
p.age               # 20
p.name = '박찬호'   # the value can be changed (sets the attribute on this instance)
p.myprint()         # 이름은 박찬호, 나이는 20
p.job = '엔지니어'  # new attributes can be created as well
p.job               # 엔지니어

저작자표시

'PYTHON 3.11' 카테고리의 다른 글

35 PYTHON 예외사항, SQLite (1)	2023.11.27
34 PYTHON CLASS, 상속 (1)	2023.11.23
32 PYTHON RANK, PANDAS, UNSTACK, PIVOT, MATPLOTLIB (3)	2023.11.21
31 PYTHON isin, null, apply, str, 그룹함수, MERGE (1)	2023.11.20
30 PYTHON WITH, CSV, Lambda, PANDAS, SERIES, DATAFRAME (0)	2023.11.17

'PYTHON 3.11'의 다른글

현재글33 PYTHON 그래프, CLASS

Record.

33 PYTHON 그래프, CLASS

'PYTHON 3.11' 카테고리의 다른 글

티스토리툴바