기본 콘텐츠로 건너뛰기

Python Gower

# Install the gower module (the leading "!" runs pip as a shell command from the notebook)
# It computes the dissimilarity between rows that hold categorical and continuous data
!pip install gower

예제
import pandas as pd

# Sample customer records: one list per attribute, ten customers in each list.
# The columns mix numeric (age, salary), boolean (children) and string values.
customers = {
    'age': [22, 25, 30, 34, 45, 34, 50, 47, 59, 62],
    'gender': ['m', 'm', 'f', 'm', 'f', 'm', 'f', 'f', 'm', 'f'],
    'marriage': ['y', 'n', 'n', 'y', 'y', 'n', 'y', 'y', 'n', 'y'],
    'salary': [3400, 3500, 4300, 3900, 4020, 4800, 5030, 2900, 4500, 3400],
    'children': [
        True, False, False, True, True,
        False, True, False, True, False,
    ],
    'purchase_type': [
        'low', 'low', 'low', 'heavy', 'heavy',
        'low', 'heavy', 'low', 'heavy', 'low',
    ],
}

# Build the frame with row labels user01 .. user10 supplied at construction time.
df = pd.DataFrame(
    customers,
    index=[f'user{number:02d}' for number in range(1, 11)],
)
df
        age  gender  marriage  salary  children  purchase_type
user01   22       m         y    3400      True            low
user02   25       m         n    3500     False            low
user03   30       f         n    4300     False            low
user04   34       m         y    3900      True          heavy
user05   45       f         y    4020      True          heavy
user06   34       m         n    4800     False            low
user07   50       f         y    5030      True          heavy
user08   47       f         y    2900     False            low
user09   59       m         n    4500      True          heavy
user10   62       f         y    3400     False            low

import gower
# Dissimilarity
gower_matrix = gower.gower_matrix(df) # compute the dissimilarity between every pair of rows

# Wrap the result in a DataFrame whose rows and columns both carry df's index labels,
# so each cell reads as the dissimilarity between the two named users.
df_gower = pd.DataFrame(gower_matrix,index=df.index,columns=df.index)
df_gower
          user01    user02    user03    user04    user05    user06    user07    user08    user09    user10
user01  0.000000  0.353658  0.603756  0.255790  0.477680  0.492880  0.577543  0.476624  0.573572  0.500000
user02  0.353658  0.000000  0.250098  0.568799  0.790689  0.139221  0.890552  0.471948  0.553247  0.495325
user03  0.603756  0.250098  0.000000  0.714632  0.584409  0.222457  0.640454  0.347046  0.636483  0.370423
user04  0.255790  0.568799  0.714632  0.000000  0.221890  0.570423  0.321753  0.632414  0.317782  0.655790
user05  0.477680  0.790689  0.584409  0.221890  0.000000  0.773533  0.099863  0.429304  0.429225  0.452680
user06  0.492880  0.139221  0.222457  0.570423  0.773533  0.000000  0.751330  0.536170  0.460974  0.559546
user07  0.577543  0.890552  0.640454  0.321753  0.099863  0.751330  0.000000  0.512500  0.412304  0.510876
user08  0.476624  0.471948  0.347046  0.632414  0.429304  0.536170  0.512500  0.000000  0.841862  0.101624
user09  0.573572  0.553247  0.636483  0.317782  0.429225  0.460974  0.412304  0.841862  0.000000  0.765239
user10  0.500000  0.495325  0.370423  0.655790  0.452680  0.559546  0.510876  0.101624  0.765239  0.000000

from sklearn.cluster import DBSCAN

# Cluster on the dissimilarity matrix itself rather than on raw feature vectors:
# df_gower is passed to fit() and metric='precomputed' marks it as already computed.
dbscan = DBSCAN(eps=0.3, min_samples=2, metric='precomputed') # metric='precomputed' : distances are computed in advance
dbscan.fit(df_gower)

# Cluster label assigned to each row.
# NOTE(review): per the scikit-learn DBSCAN docs a label of -1 marks a noise sample — confirm.
dbscan.labels_
array([ 0,  1,  1,  0,  0,  1,  0,  2, -1,  2], dtype=int64)

# Store each row's DBSCAN label in a new 'cluster' column of the customer frame, then display it
df['cluster'] = dbscan.labels_
df
        age  gender  marriage  salary  children  purchase_type  cluster
user01   22       m         y    3400      True            low        0
user02   25       m         n    3500     False            low        1
user03   30       f         n    4300     False            low        1
user04   34       m         y    3900      True          heavy        0
user05   45       f         y    4020      True          heavy        0
user06   34       m         n    4800     False            low        1
user07   50       f         y    5030      True          heavy        0
user08   47       f         y    2900     False            low        2
user09   59       m         n    4500      True          heavy       -1
user10   62       f         y    3400     False            low        2

이 블로그의 인기 게시물

Python Sklearn make_regression

from sklearn.datasets import make_regression import matplotlib.pyplot as plt X, y = make_regression(n_samples=250, n_features=1, noise=50, random_state=2) plt.scatter(X,y, s=2) plt.show() from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split # 한글 깨짐 없이 나오게 설정 from matplotlib import rcParams # 인코딩 폰트 설정 rcParams['font.family'] = 'New Gulim' rcParams['font.size'] = 10 x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=.20, random_state=0) x_train.shape, x_test.shape, y_train.shape, y_test.shape # 모델 생성 model = LinearRegression() # 학습하기 model.fit(x_train, y_train) # 가중치, 편향치 구하기 model.coef_, model.intercept_ # (array([90.11061494]), 2.4224269924448585) # 결정 계수 model.score(x_train, y_train) # 0.789267454050733 # 추정 pred = model.predict(x_test) # 산점도 plt.scatter(x_test,y_test) plt.plot(x_test, pred, 'r-') plt.show() # 추정 model.predict([[3.0]]) # 학습할 때 주는 데이터의 형식을 따른다 # x의 최소값, 최대값을 계수와 절편을 사용하여 ...