from sklearn.datasets import make_classification
make_classification
# Generate a 100-sample, 5-feature classification dataset.
# make_classification() does not accept a `noise` keyword (that parameter
# belongs to make_regression), so passing it raises TypeError; label noise
# for classification is controlled through `flip_y` instead.
# random_state: seed for the random number generator.
x, y = make_classification(n_samples=100, n_features=5, random_state=12)
x.shape  # (100, 5): 100 rows, 5 columns
y.shape  # (100,): 100 rows
make_regression
# make_regression is used below but was never imported; import it next to
# its first use, the same way make_blobs is imported further down.
from sklearn.datasets import make_regression

# 100 samples with a single feature; `noise` is the standard deviation of
# the Gaussian noise added to the target, random_state fixes the seed.
X, y = make_regression(n_samples=100, n_features=1, noise=30, random_state=1)
예제
# Dump the single feature column and the target as "x,y" rows, then load
# the file back into a DataFrame with explicit column names.
with open('make_regression.csv', 'wt', encoding='utf-8') as fout:
    # Iterate over the data itself instead of a hard-coded range(0, 100) so
    # the export stays correct if n_samples is changed above.
    fout.writelines(f'{x_val},{y_val}\n' for x_val, y_val in zip(X[:, 0], y))

# The file has no header row, so the column names are supplied here.
df = pd.read_csv('make_regression.csv', header=None, names=['x', 'y'])
def iqr_limits(series, k=1.5):
    """Return ``(min_lim, max_lim)`` outlier fences for *series*.

    The fences lie ``k`` interquartile ranges below the first quartile and
    above the third quartile.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - iqr * k, q3 + iqr * k


# Outlier limits for the x coordinate.
x_min_lim, x_max_lim = iqr_limits(df['x'])
# Recorded on the first run: 'max_lim: 2.4779299273538546, min_lim: -2.32235040430704'
display(f'max_lim: {x_max_lim}, min_lim: {x_min_lim}')

# Outlier limits for the y coordinate.
y_min_lim, y_max_lim = iqr_limits(df['y'])
# Recorded on the first run: 'max_lim: 210.28350648185912, min_lim: -182.9504851764799'
display(f'max_lim: {y_max_lim}, min_lim: {y_min_lim}')

# Outlier detection. Each column is compared against its OWN limits; keeping
# separate names prevents the y limits from overwriting the x limits before
# the x check runs.
x_high = df['x'] > x_max_lim
x_low = df['x'] < x_min_lim
display((int(x_high.sum()), int(x_low.sum())))  # (above, below) counts for x
#display(df[x_high])
#display(df[x_low])

y_high = df['y'] > y_max_lim
y_low = df['y'] < y_min_lim
display((int(y_high.sum()), int(y_low.sum())))  # recorded on the first run: (0, 2)
#display(df[y_high])
display(df[y_low])
| | x | y |
---|---|---|
38 | -2.301539 | -195.983233 |
89 | -2.022201 | -218.352180 |
| | x | y |
---|---|---|
81 | -2.060141 | -189.648236 |
# Remove the outlier rows by index label, in two batches:
# rows 38 and 89 first, then row 81.
for outlier_labels in ([38, 89], [81]):
    df.drop(index=outlier_labels, inplace=True)

# Inspect the DataFrame after the removal.
display(df)
| | x | y |
---|---|---|
0 | -0.611756 | -58.620300 |
1 | -0.249370 | -5.929571 |
2 | 0.488518 | 14.421835 |
3 | 0.762011 | 120.893034 |
4 | 1.519817 | 155.173734 |
... | ... | ... |
95 | -0.298093 | -23.910414 |
96 | 1.659802 | 131.147185 |
97 | 0.043597 | 29.152012 |
98 | 0.042214 | 74.963268 |
99 | -0.191836 | -31.717486 |
97 rows × 2 columns
# Box plot of the x column to check for remaining outliers.
plt.boxplot(df['x'])
plt.title('X 데이터 이상치 확인')
plt.show()

# Box plot of the y column; the title now names Y (it previously repeated
# the X title, a copy-paste slip that mislabelled this figure).
plt.boxplot(df['y'])
plt.title('Y 데이터 이상치 확인')
plt.show()
make_blobs
from sklearn.datasets import make_blobs
# centers=3: the samples are generated as clusters around three centres.
blob_params = dict(n_samples=300, n_features=2, centers=3, random_state=3)
X, y = make_blobs(**blob_params)
예제
# Save the blob coordinates and labels to CSV, then read the file back.
# Fixes relative to the earlier version of this cell:
#   * the features returned by make_blobs are bound to `X` (`xy` was undefined);
#   * to_csv() needs a path, otherwise it only returns a string and no file
#     is written for read_csv() to load;
#   * index=False keeps the row index out of the file so that the columns
#     read back are exactly x, y, label.
blobs = pd.DataFrame({'x': X[:, 0], 'y': X[:, 1], 'label': y})
blobs.to_csv('my_classification_dataset.csv', index=False)
df = pd.read_csv('my_classification_dataset.csv')
display(df)
'''
x | y | label | |
---|---|---|---|
0 | 6.969050 | 6.344916 | 2 |
1 | 8.866481 | 6.564206 | 2 |
2 | 0.818543 | 5.937601 | 0 |
3 | -4.338424 | -2.055692 | 1 |
4 | 8.153471 | 7.272724 | 2 |
... | ... | ... | ... |
295 | 6.375622 | 8.063778 | 2 |
296 | 2.502106 | 4.399673 | 0 |
297 | 9.488051 | 9.369354 | 2 |
298 | -3.412940 | 1.582795 | 1 |
299 | -1.370027 | 0.468547 | 1 |
'''
# Scatter plot of the points, coloured by the value in the label column.
x_coords = df['x']
y_coords = df['y']
point_labels = df['label']
plt.scatter(x_coords, y_coords, c=point_labels)
plt.show()
'''
'''