# Gower 모듈 설치
# 범주형, 연속형 데이터를 가진 행들간의 비유사도 계산
!pip install gower
예제
import pandas as pd

# Toy customer records: two integer columns (age, salary), three string
# columns (gender, marriage, purchase_type) and one boolean column (children).
customers = dict(
    age=[22, 25, 30, 34, 45, 34, 50, 47, 59, 62],
    gender=['m', 'm', 'f', 'm', 'f', 'm', 'f', 'f', 'm', 'f'],
    marriage=['y', 'n', 'n', 'y', 'y', 'n', 'y', 'y', 'n', 'y'],
    salary=[3400, 3500, 4300, 3900, 4020, 4800, 5030, 2900, 4500, 3400],
    children=[True, False, False, True, True, False, True, False, True, False],
    purchase_type=['low', 'low', 'low', 'heavy', 'heavy',
                   'low', 'heavy', 'low', 'heavy', 'low'],
)

# Label the ten rows user01 .. user10 directly at construction time.
df = pd.DataFrame(customers, index=[f'user{n:02d}' for n in range(1, 11)])
df
| | age | gender | marriage | salary | children | purchase_type |
---|---|---|---|---|---|---|
user01 | 22 | m | y | 3400 | True | low |
user02 | 25 | m | n | 3500 | False | low |
user03 | 30 | f | n | 4300 | False | low |
user04 | 34 | m | y | 3900 | True | heavy |
user05 | 45 | f | y | 4020 | True | heavy |
user06 | 34 | m | n | 4800 | False | low |
user07 | 50 | f | y | 5030 | True | heavy |
user08 | 47 | f | y | 2900 | False | low |
user09 | 59 | m | n | 4500 | True | heavy |
user10 | 62 | f | y | 3400 | False | low |
import gower

# Dissimilarity: compute the Gower dissimilarity between every pair of rows of df.
gower_matrix = gower.gower_matrix(df)

# Wrap the raw matrix in a DataFrame whose rows and columns both carry the
# user labels, so an entry can be looked up by a pair of user ids.
df_gower = pd.DataFrame(data=gower_matrix, columns=df.index, index=df.index)
df_gower
| | user01 | user02 | user03 | user04 | user05 | user06 | user07 | user08 | user09 | user10 |
---|---|---|---|---|---|---|---|---|---|---|
user01 | 0.000000 | 0.353658 | 0.603756 | 0.255790 | 0.477680 | 0.492880 | 0.577543 | 0.476624 | 0.573572 | 0.500000 |
user02 | 0.353658 | 0.000000 | 0.250098 | 0.568799 | 0.790689 | 0.139221 | 0.890552 | 0.471948 | 0.553247 | 0.495325 |
user03 | 0.603756 | 0.250098 | 0.000000 | 0.714632 | 0.584409 | 0.222457 | 0.640454 | 0.347046 | 0.636483 | 0.370423 |
user04 | 0.255790 | 0.568799 | 0.714632 | 0.000000 | 0.221890 | 0.570423 | 0.321753 | 0.632414 | 0.317782 | 0.655790 |
user05 | 0.477680 | 0.790689 | 0.584409 | 0.221890 | 0.000000 | 0.773533 | 0.099863 | 0.429304 | 0.429225 | 0.452680 |
user06 | 0.492880 | 0.139221 | 0.222457 | 0.570423 | 0.773533 | 0.000000 | 0.751330 | 0.536170 | 0.460974 | 0.559546 |
user07 | 0.577543 | 0.890552 | 0.640454 | 0.321753 | 0.099863 | 0.751330 | 0.000000 | 0.512500 | 0.412304 | 0.510876 |
user08 | 0.476624 | 0.471948 | 0.347046 | 0.632414 | 0.429304 | 0.536170 | 0.512500 | 0.000000 | 0.841862 | 0.101624 |
user09 | 0.573572 | 0.553247 | 0.636483 | 0.317782 | 0.429225 | 0.460974 | 0.412304 | 0.841862 | 0.000000 | 0.765239 |
user10 | 0.500000 | 0.495325 | 0.370423 | 0.655790 | 0.452680 | 0.559546 | 0.510876 | 0.101624 | 0.765239 | 0.000000 |
from sklearn.cluster import DBSCAN

# metric='precomputed': the matrix handed to fit() already holds the pairwise
# distances (here the Gower dissimilarities), so DBSCAN does not compute its own.
dbscan = DBSCAN(metric='precomputed', eps=0.3, min_samples=2).fit(df_gower)
dbscan.labels_
array([ 0, 1, 1, 0, 0, 1, 0, 2, -1, 2], dtype=int64)
# Attach each row's DBSCAN cluster label to the customer table as a new
# 'cluster' column (overwrites the column if it already exists).
# NOTE(review): per the scikit-learn documentation, a label of -1 marks a
# sample DBSCAN treated as noise rather than a member of any cluster.
df['cluster'] = dbscan.labels_
df
| | age | gender | marriage | salary | children | purchase_type | cluster |
---|---|---|---|---|---|---|---|
user01 | 22 | m | y | 3400 | True | low | 0 |
user02 | 25 | m | n | 3500 | False | low | 1 |
user03 | 30 | f | n | 4300 | False | low | 1 |
user04 | 34 | m | y | 3900 | True | heavy | 0 |
user05 | 45 | f | y | 4020 | True | heavy | 0 |
user06 | 34 | m | n | 4800 | False | low | 1 |
user07 | 50 | f | y | 5030 | True | heavy | 0 |
user08 | 47 | f | y | 2900 | False | low | 2 |
user09 | 59 | m | n | 4500 | True | heavy | -1 |
user10 | 62 | f | y | 3400 | False | low | 2 |