Training set and test set
In general we should not feed an entire dataset to the model; we need to hold back a small portion to evaluate the model's accuracy and performance. The data "fed" to the algorithm is called the training set, and the data held back for evaluation is called the test set.
Split the iris dataset into a training set and a test set
Split rule: randomly select 20% of the samples as the test set and keep the remaining 80% as the training set.
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
Y = iris.target
# shuffle the sample indices, then take the first 20% as test indices
shuffle_index = np.random.permutation(len(Y))
test_ratio = 0.2
test_size = int(test_ratio * len(Y))
test_indexes = shuffle_index[:test_size]
train_indexes = shuffle_index[test_size:]

if __name__ == '__main__':
    print(test_indexes)
    print(train_indexes)
Wrapping the train/test split in a reusable function
import numpy as np

def train_test_split(X, y, test_ratio=0.2, seed=None):
    assert X.shape[0] == y.shape[0], "the size of X must be equal to the size of y"
    assert 0.0 < test_ratio < 1.0, "test_ratio must be in (0, 1)"
    if seed is not None:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(test_ratio * len(y))
    # the first test_size shuffled indices form the test set, the rest form the training set
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, X_test, y_train, y_test
Verifying the model's accuracy
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from playML.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data
Y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, Y)
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, y_train)
y_predict = knn_classifier.predict(X_test)
# compute the prediction accuracy
res = sum(y_predict == y_test) / len(y_test)

if __name__ == '__main__':
    print(res)
#out: 0.9416666666666667
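Computing the accuracy by hand is instructive, but scikit-learn already ships the same metric. A minimal sketch, reusing knn_classifier, y_predict, X_test and y_test from above; both calls should print the same value as the manual computation:

from sklearn.metrics import accuracy_score

# fraction of test samples whose predicted label matches the true label
print(accuracy_score(y_test, y_predict))
# KNeighborsClassifier.score predicts on X_test internally and reports the same accuracy
print(knn_classifier.score(X_test, y_test))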
Hyperparameters
The value of K mentioned above usually comes from the practitioner's experience or from experiments; it is not picked arbitrarily. The example below tries every K from 1 to 10 and keeps the value that gives the best accuracy.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

if __name__ == '__main__':
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    best_score = 0.0
    best_k = -1
    # try K from 1 to 10 and keep the one with the highest test accuracy
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
    print("best_k = ", best_k)
    print("best_score", best_score)
# out:
# best_k = 3
# best_score 0.9972222222222222
The weight parameter
Suppose we set K = 3. In the scenario below there are three nearest neighbors, and under simple majority voting the green point is classified as blue. Yet the red point is the closest of the three; ignoring each neighbor's distance can therefore cause misclassification. Weighting each vote by the inverse of its distance fixes this:

1. Red weight: red = 1 * 1/1 = 1
2. Blue weight: blue = 1 * 1/3 + 1 * 1/4 = 7/12

Since red > blue, red wins.
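As a minimal sketch of this inverse-distance vote (the distances 1, 3 and 4 and the labels are the hypothetical values from the example above, not taken from any dataset or library API):

# hypothetical distances and labels of the K = 3 nearest neighbors from the example
distances = [1.0, 3.0, 4.0]
labels = ["red", "blue", "blue"]
# each neighbor contributes 1 / distance to its own class
votes = {}
for d, label in zip(distances, labels):
    votes[label] = votes.get(label, 0.0) + 1.0 / d
print(votes)                      # {'red': 1.0, 'blue': 0.5833...}
print(max(votes, key=votes.get))  # red wins, matching the manual computation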
Searching over the weight parameter and K
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

if __name__ == '__main__':
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    best_score = 0.0
    best_k = -1
    best_mth = ''
    # search over the weighting scheme and K together
    for mth in ["distance", "uniform"]:
        for k in range(1, 11):
            knn_clf = KNeighborsClassifier(n_neighbors=k, weights=mth)
            knn_clf.fit(X_train, y_train)
            score = knn_clf.score(X_test, y_test)
            if score > best_score:
                best_k = k
                best_score = score
                best_mth = mth
    print("best_k = ", best_k)
    print("best_score=", best_score)
    print("best_mth=", best_mth)
#out:
#best_k = 6
#best_score= 0.9944444444444445
#best_mth= uniform
The Minkowski parameter
Euclidean distance

$$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$$

Manhattan distance

$$d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$$

Both are special cases of the Minkowski distance, which KNeighborsClassifier exposes through the parameter p (p = 1 gives the Manhattan distance, p = 2 the Euclidean distance):

$$d(x, y) = \left( \sum_{i=1}^{n} |x_i - y_i|^p \right)^{1/p}$$
Searching over the weight parameter, K, and the Minkowski parameter p
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

if __name__ == '__main__':
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    best_score = 0.0
    best_k = -1
    best_mth = ''
    best_p = 1
    # search over p, the weighting scheme and K together
    for p in [1, 2, 3, 4, 5]:
        for mth in ["distance", "uniform"]:
            for k in range(1, 11):
                knn_clf = KNeighborsClassifier(n_neighbors=k, weights=mth, p=p)
                knn_clf.fit(X_train, y_train)
                score = knn_clf.score(X_test, y_test)
                if score > best_score:
                    best_k = k
                    best_score = score
                    best_mth = mth
                    best_p = p
    print("best_k = ", best_k)
    print("best_score=", best_score)
    print("best_mth=", best_mth)
    print("best_p=", best_p)
#out:
#best_k = 6
#best_score= 0.9944444444444445
#best_mth= uniform
#best_p= 2
Grid search
Above, we found the best parameter values by hand-writing nested for loops. scikit-learn provides GridSearchCV to perform this parameter grid search for us.
- Define the grid of parameter values to search
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

digits = datasets.load_digits()
x = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# the parameter grid: a list of dicts, each describing one group of combinations to try
param_grid = [
    {
        "n_neighbors": [i for i in range(1, 10)],
        "weights": ["uniform"]
    },
    {
        "n_neighbors": [i for i in range(1, 10)],
        "weights": ["distance"],
        "p": [i for i in range(1, 6)]
    }
]
knn_clf = KNeighborsClassifier()

if __name__ == '__main__':
    grid_search = GridSearchCV(knn_clf, param_grid)
    print(grid_search.fit(X_train, y_train))
    # the best estimator found by the grid search
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
# out:
#GridSearchCV(estimator=KNeighborsClassifier(),
# param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
# 'weights': ['uniform']},
# {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
# 'p': [1, 2, 3, 4, 5], 'weights': ['distance']}])
#KNeighborsClassifier(n_neighbors=7, p=4, weights='distance')
#0.9853948896631823
#{'n_neighbors': 7, 'p': 4, 'weights': 'distance'}
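Note that best_score_ is the mean cross-validation score computed on the training data, not a test score. As a small follow-up sketch (reusing grid_search, X_test and y_test from above; the exact number will vary between runs), the best estimator can be evaluated on the held-out test set:

best_knn_clf = grid_search.best_estimator_
# accuracy on the held-out test set, which the grid search never saw
print(best_knn_clf.score(X_test, y_test))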
GridSearchCV also accepts n_jobs, which sets how many CPU cores are used in parallel, and verbose, which controls how much progress information is printed while the search runs.
if __name__ == '__main__':
    grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=6, verbose=2)
    print(grid_search.fit(X_train, y_train))
    # the best estimator found by the grid search
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
#out:
[CV] END ...............n_neighbors=9, p=3, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=3, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=4, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=4, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=4, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=8, p=5, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=4, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=4, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=5, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=5, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=5, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=5, weights=distance; total time= 0.3s
[CV] END ...............n_neighbors=9, p=5, weights=distance; total time= 0.3s
Data normalization
In the cancer data below, tumor size and time since discovery sit on very different scales: the sizes are 1 and 5, while the discovery times are 100 days and 200 days. Left as raw values, the number of days will dominate the distance calculation and therefore the prediction.
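To see why, compute the Euclidean distance between the two samples using the illustrative values above (tumor sizes 1 and 5, discovery times 100 and 200 days):

$$d = \sqrt{(5 - 1)^2 + (200 - 100)^2} = \sqrt{16 + 10000} \approx 100.08$$

Almost the entire distance comes from the days feature, so tumor size has essentially no say in which neighbors are "nearest".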


The fix is to map all features into a common range, for example 0 to 1.

Min-max normalization
Find the maximum X_max and minimum X_min of a feature, then map every value X to (X - X_min) / (X_max - X_min). This suits features with clear boundaries, such as exam scores that run from a minimum of 0 to a maximum of 100.
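Written as a formula:

$$x_{scale} = \frac{x - x_{min}}{x_{max} - x_{min}}$$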
Min-max normalization of a one-dimensional array
import numpy as np

if __name__ == '__main__':
    # 100 random integers in the range [0, 100]
    x = np.random.randint(0, 101, size=100)
    y = (x - np.min(x)) / (np.max(x) - np.min(x))
    print(y)
#out:
[0.78787879 0.35353535 0.72727273 0.25252525 0.7979798 0.64646465
0.09090909 0.09090909 0.82828283 0.68686869 0.08080808 0.93939394
0.34343434 0.36363636 0.04040404 0.70707071 0.19191919 0.16161616
0.56565657 0.66666667 0.46464646 0.78787879 0.48484848 0.18181818
0.2020202 0.73737374 0.90909091 0.1010101 0.66666667 0.22222222
0.90909091 0.02020202 0.1010101 0.90909091 0. 0.01010101
0.29292929 0.52525253 0.49494949 0.96969697 0.77777778 0.94949495
0.11111111 0.87878788 0.48484848 0.92929293 0.49494949 0.12121212
0.73737374 0. 0.48484848 1. 0.15151515 0.56565657
0.25252525 0.50505051 0.84848485 0.31313131 0.95959596 0.64646465
0.37373737 0.41414141 0.37373737 0.42424242 0.29292929 0.67676768
0.78787879 0.05050505 0. 0.02020202 0.47474747 0.92929293
0.45454545 0.13131313 0.67676768 0.29292929 0.02020202 0.97979798
0.28282828 0.17171717 0.73737374 0.8989899 0.14141414 0.88888889
0.94949495 0.71717172 0.29292929 0.22222222 0.90909091 0.22222222
0.28282828 0.29292929 0.28282828 0.53535354 0.66666667 0.90909091
0.12121212 0.67676768 0.29292929 0.36363636]
Mean-variance normalization (standardization)
Some features, such as income, have no obvious boundary, so min-max scaling is a poor fit for them. In that case we can use mean-variance normalization, which rescales the data so that every feature has mean 0 and variance 1.
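Each value is centered on the feature mean $\mu$ and divided by the feature's standard deviation $\sigma$:

$$x_{scale} = \frac{x - \mu}{\sigma}$$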

Mean-variance normalization of column 0 of a two-dimensional array
x = np.random.randint(0, 101, size=(50, 2))
x = np.array(x, dtype="float")
# standardize column 0: subtract the column mean and divide by the column standard deviation
x[:, 0] = (x[:, 0] - np.mean(x[:, 0])) / np.std(x[:, 0])
print(x[:, 0])
#out:
[-1.27374257 0.01286609 0.65617042 -0.23730782 0.97782258 1.62112691
-1.48817735 0.58469216 0.08434435 0.19156173 0.12008347 0.7633878
-0.59469911 1.08503997 -0.91635128 -0.13009043 -0.38026434 0.72764867
0.5132139 0.83486606 -1.77409038 -1.41669909 1.62112691 -0.0943513
-1.45243822 0.90634432 0.72764867 0.37025738 0.90634432 1.370953
1.19225736 1.19225736 0.26303999 -0.20156869 -1.63113387 -0.91635128
-1.63113387 0.04860522 0.97782258 1.62112691 0.54895303 -0.7019165
-0.98782954 0.37025738 0.83486606 -0.4517426 -1.84556864 -1.27374257
-0.16582956 -1.55965561]
Each feature value has the mean of its feature subtracted and is then divided by that feature's standard deviation.
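As a quick check (reusing x from the snippet above), the standardized column should have a mean of roughly 0 and a standard deviation of roughly 1:

print(np.mean(x[:, 0]))  # approximately 0
print(np.std(x[:, 0]))   # approximately 1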