기본 콘텐츠로 건너뛰기

Python Sklearn load_boston

import matplotlib.pyplot as plt  # needed for plt.scatter / plt.plot / plt.bar below

from sklearn.linear_model import LinearRegression # Linear : 선형, Regression : 회귀
from sklearn.model_selection import train_test_split # train : 학습용, test : 검증용
from sklearn.datasets import load_boston # 집 정보

예제
# Supervised learning needs features (X) and answers (y): load the Boston
# housing dataset, which provides both.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 — on current versions this line raises; confirm the pinned sklearn
# version, or switch to fetch_openml(name="boston") / another dataset.
boston = load_boston()

# Build a DataFrame: one column per feature, named after the feature.
import pandas as pd
sample_boston = pd.DataFrame(boston.data, columns=boston.feature_names)
# Attach the regression target (median home value). boston.target is already
# a 1-D array, so assign it directly instead of wrapping it in a throwaway
# single-column DataFrame as before — same result, less work.
sample_boston['target'] = boston.target
sample_boston  # notebook-style display of the assembled frame
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
.............................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.6722.4
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.0820.6
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.6423.9
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.4822.0
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.8811.9

# Missing-value check: count of NaNs per column (all zeros → dataset is complete).
sample_boston.isna().sum()
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

# Outlier check: one box plot per numeric column.
sample_boston.boxplot()














# Correlation analysis: compute the full correlation matrix, then rank the
# rows by each feature's correlation with the target (strongest first).
correlation_matrix = sample_boston.corr()
correlation_matrix.sort_values('target', ascending=False)
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
target-0.3883050.360445-0.4837250.175260-0.4273210.695360-0.3769550.249929-0.381626-0.468536-0.5077870.333461-0.7376631.000000
RM-0.2192470.311991-0.3916760.091251-0.3021881.000000-0.2402650.205246-0.209847-0.292048-0.3555010.128069-0.6138080.695360
ZN-0.2004691.000000-0.533828-0.042697-0.5166040.311991-0.5695370.664408-0.311948-0.314563-0.3916790.175520-0.4129950.360445
B-0.3850640.175520-0.3569770.048788-0.3800510.128069-0.2735340.291512-0.444413-0.441808-0.1773831.000000-0.3660870.333461
DIS-0.3796700.664408-0.708027-0.099176-0.7692300.205246-0.7478811.000000-0.494588-0.534432-0.2324710.291512-0.4969960.249929
CHAS-0.055892-0.0426970.0629381.0000000.0912030.0912510.086518-0.099176-0.007368-0.035587-0.1215150.048788-0.0539290.175260
AGE0.352734-0.5695370.6447790.0865180.731470-0.2402651.000000-0.7478810.4560220.5064560.261515-0.2735340.602339-0.376955
RAD0.625505-0.3119480.595129-0.0073680.611441-0.2098470.456022-0.4945881.0000000.9102280.464741-0.4444130.488676-0.381626
CRIM1.000000-0.2004690.406583-0.0558920.420972-0.2192470.352734-0.3796700.6255050.5827640.289946-0.3850640.455621-0.388305
NOX0.420972-0.5166040.7636510.0912031.000000-0.3021880.731470-0.7692300.6114410.6680230.188933-0.3800510.590879-0.427321
TAX0.582764-0.3145630.720760-0.0355870.668023-0.2920480.506456-0.5344320.9102281.0000000.460853-0.4418080.543993-0.468536
INDUS0.406583-0.5338281.0000000.0629380.763651-0.3916760.644779-0.7080270.5951290.7207600.383248-0.3569770.603800-0.483725
PTRATIO0.289946-0.3916790.383248-0.1215150.188933-0.3555010.261515-0.2324710.4647410.4608531.000000-0.1773830.374044-0.507787
LSTAT0.455621-0.4129950.603800-0.0539290.590879-0.6138080.602339-0.4969960.4886760.5439930.374044-0.3660871.000000-0.737663

# Scatter plot: house price (target) against average room count (RM).
prices = sample_boston['target']
rooms = sample_boston['RM']
plt.scatter(prices, rooms)
plt.show()














# Line plot of target vs. RM (connecting unsorted points — kept from the
# original exploration even though a scatter plot reads better here).
plt.plot(sample_boston['target'],sample_boston['RM'])
plt.show()














# Bar chart (the original comment said "line chart", but plt.bar draws bars).
plt.bar(sample_boston['target'],sample_boston['RM'])
plt.show()














# Pair plot: pairwise scatter plots of every feature, colored by target value.
import seaborn as sns
# NOTE(review): the trailing list names seaborn color palettes — those belong
# to the `palette=` argument, not `hue=`; comment appears misplaced.
sns.pairplot(sample_boston, hue='target') # pastel, bright, deep, muted, colorblind, dark
plt.show()

















# Conclusion: the number of rooms (RM) correlates most strongly with price.

# Build the datasets: 80% train / 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(boston['data'], boston['target'], test_size=.20, random_state=123)

# Shapes of the splits (506 rows total → 404 train / 102 test, 13 features).
X_train.shape # (404, 13)
X_test.shape # (102, 13)
y_train.shape # (404,)
y_test.shape # (102,)

# Create the (still untrained) linear-regression model.
model = LinearRegression()
model

# Train the model: learn one weight per feature plus an intercept.
model.fit(X_train,y_train)

# Learned weights (coefficients), one per feature.
model.coef_
array([-9.87931696e-02,  4.75027102e-02,  6.69491841e-02,  1.26954150e+00,
       -1.54697747e+01,  4.31968412e+00, -9.80167937e-04, -1.36597953e+00,
        2.84521838e-01, -1.27533606e-02, -9.13487599e-01,  7.22553507e-03,
       -5.43790245e-01])

# Learned intercept (bias term).
model.intercept_ # 31.835164121206343

# Coefficient of determination (R^2) on the TRAINING set; closer to 1 is better.
model.score(X_train, y_train) # 0.7559380876016175

# Predict prices for the held-out test set.
pred = model.predict(X_test)

# Predicted vs. actual: points near the 45-degree diagonal mean good predictions.
plt.scatter(y_test, pred)
plt.title("모델이 추정한 값과 실제값의 차이")
plt.xlabel("y_test")
plt.ylabel("pred")
plt.show()














# The closer the scatter hugs the 45-degree line, the more accurate the model.

# Overlay actual (y_test) and predicted (pred) values by sample index.
plt.plot(range(0,len(y_test)),y_test,range(0,len(pred)),pred)
plt.show()














이 블로그의 인기 게시물