기본 콘텐츠로 건너뛰기

Python Gower

# Install the gower package.
# It computes the dissimilarity between rows that mix categorical and continuous data.
!pip install gower

예제
import pandas as pd

# Toy customer records: one list per column, ten customers in total.
customers = {
    'age': [22, 25, 30, 34, 45, 34, 50, 47, 59, 62],
    'gender': ['m', 'm', 'f', 'm', 'f', 'm', 'f', 'f', 'm', 'f'],
    'marriage': ['y', 'n', 'n', 'y', 'y', 'n', 'y', 'y', 'n', 'y'],
    'salary': [3400, 3500, 4300, 3900, 4020, 4800, 5030, 2900, 4500, 3400],
    'children': [
        True, False, False, True, True,
        False, True, False, True, False,
    ],
    'purchase_type': [
        'low', 'low', 'low', 'heavy', 'heavy',
        'low', 'heavy', 'low', 'heavy', 'low',
    ],
}

# Label the rows user01 .. user10 directly at construction time.
df = pd.DataFrame(
    customers,
    index=[f'user{n:02d}' for n in range(1, len(customers['age']) + 1)],
)
df
        age  gender  marriage  salary  children  purchase_type
user01   22       m         y    3400      True            low
user02   25       m         n    3500     False            low
user03   30       f         n    4300     False            low
user04   34       m         y    3900      True          heavy
user05   45       f         y    4020      True          heavy
user06   34       m         n    4800     False            low
user07   50       f         y    5030      True          heavy
user08   47       f         y    2900     False            low
user09   59       m         n    4500      True          heavy
user10   62       f         y    3400     False            low

import gower

# Dissimilarity between every pair of rows in df.
gower_matrix = gower.gower_matrix(df)

# Wrap the raw matrix so that both axes carry the customer labels.
df_gower = pd.DataFrame(
    data=gower_matrix,
    index=df.index,
    columns=df.index,
)
df_gower
          user01    user02    user03    user04    user05    user06    user07    user08    user09    user10
user01  0.000000  0.353658  0.603756  0.255790  0.477680  0.492880  0.577543  0.476624  0.573572  0.500000
user02  0.353658  0.000000  0.250098  0.568799  0.790689  0.139221  0.890552  0.471948  0.553247  0.495325
user03  0.603756  0.250098  0.000000  0.714632  0.584409  0.222457  0.640454  0.347046  0.636483  0.370423
user04  0.255790  0.568799  0.714632  0.000000  0.221890  0.570423  0.321753  0.632414  0.317782  0.655790
user05  0.477680  0.790689  0.584409  0.221890  0.000000  0.773533  0.099863  0.429304  0.429225  0.452680
user06  0.492880  0.139221  0.222457  0.570423  0.773533  0.000000  0.751330  0.536170  0.460974  0.559546
user07  0.577543  0.890552  0.640454  0.321753  0.099863  0.751330  0.000000  0.512500  0.412304  0.510876
user08  0.476624  0.471948  0.347046  0.632414  0.429304  0.536170  0.512500  0.000000  0.841862  0.101624
user09  0.573572  0.553247  0.636483  0.317782  0.429225  0.460974  0.412304  0.841862  0.000000  0.765239
user10  0.500000  0.495325  0.370423  0.655790  0.452680  0.559546  0.510876  0.101624  0.765239  0.000000

from sklearn.cluster import DBSCAN

# metric='precomputed': the matrix handed to fit() already holds the
# pairwise distances, so DBSCAN does not compute them itself.
dbscan = DBSCAN(
    metric='precomputed',
    eps=0.3,
    min_samples=2,
).fit(df_gower)

dbscan.labels_
array([ 0,  1,  1,  0,  0,  1,  0,  2, -1,  2], dtype=int64)

# Attach each customer's DBSCAN cluster label as a new column
# (a label of -1 marks a row that DBSCAN left unclustered as noise).
df['cluster'] = dbscan.labels_
df
        age  gender  marriage  salary  children  purchase_type  cluster
user01   22       m         y    3400      True            low        0
user02   25       m         n    3500     False            low        1
user03   30       f         n    4300     False            low        1
user04   34       m         y    3900      True          heavy        0
user05   45       f         y    4020      True          heavy        0
user06   34       m         n    4800     False            low        1
user07   50       f         y    5030      True          heavy        0
user08   47       f         y    2900     False            low        2
user09   59       m         n    4500      True          heavy       -1
user10   62       f         y    3400     False            low        2

이 블로그의 인기 게시물