数据集的分割

数据集的分割

Hsinyan

2020 年 03 月 28 日

157 次浏览

暂无评论

2165字数

基础

导入测试数据集

这里使用scikit-learn自带的鸢尾花数据

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Keep the `data` attribute as X and the `target` attribute as y.
X = iris.data
y = iris.target
# Echo the array shapes (notebook-style bare expressions; recorded output below).
X.shape
# (150, 4)
y.shape
# (150,)

实现

查看 y 的时候会发现，label 是按照从小到大的顺序排列的，所以不能直接取前 n 个样本作为训练集、后 n 个样本作为测试集：这样划分后，训练集和测试集中各类别的分布会严重不一致，得到的模型肯定是不准确的。

这个时候可以使用permutation方法，获取到随机打乱的一组索引，之后自定义训练集和测试集的比例，这里设置测试集的比例为0.2，使用numpy的fancy indexing就可以切割得到完全随机的训练集和测试集。

# Fraction of the samples reserved for the test set
test_ratio = 0.2
# The product can be fractional, so truncate it to an integer sample count
test_size = int(len(X)*test_ratio)

# Randomly shuffled row indexes, e.g. array([106, 61, 127, ..., 141, 126, 87])
shuffle_indexes = np.random.permutation(len(X))

# The first test_size shuffled indexes form the test set, the rest the training set
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]

# Fancy indexing selects the matching rows of X and entries of y for each subset
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

print(X_train.shape)
# (120, 4)
print(y_train.shape)
# (120,)

自定义train_test_split

import numpy as np


def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    The row indexes are shuffled with ``np.random.permutation``; the first
    ``int(len(X) * test_ratio)`` shuffled indexes become the test set and
    the remaining ones the training set.

    Args:
        X: Array indexed by sample along its first axis.
        y: Array with the same first-axis length as ``X``.
        test_ratio: Fraction of samples to put in the test set, in [0, 1].
        seed: Optional seed for NumPy's global random generator. Any
            non-None value (including 0) is applied before shuffling so
            the split is reproducible.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: If X and y differ in first-axis length, or if
            ``test_ratio`` is outside [0, 1].
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None explicitly: a plain truthiness test would
    # silently ignore the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))

    # The product may be fractional; truncate to a whole number of samples.
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    # Fancy indexing keeps each row of X aligned with its entry in y.
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test

# Use the train_test_split function we wrapped ourselves
from model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

# This imports our own hand-written kNN classifier, not the scikit-learn one
from kNN import KNNClassifier
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train,y_train)

# Predict labels for the held-out test samples (recorded output below)
y_predict = my_knn_clf.predict(X_test)
# array([1, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1,
#       2, 1, 0, 0, 1, 2, 2, 0])

# Echo the true labels for comparison (recorded output below)
y_test
# array([1, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1,
#       2, 1, 0, 0, 1, 2, 2, 0])

# With y_predict and y_test in hand, compare them element-wise:
# the sum of the boolean matches is the number of correct predictions
sum(y_predict == y_test)
# 30

# Accuracy: correct predictions divided by the number of test samples
sum(y_predict == y_test)/len(y_test)
# 1.0

使用`scikit-learn`中的`train_test_split`

scikit-learn中为我们封装好了分割数据集的方法，我们可以直接调用

# Use scikit-learn's built-in train_test_split instead of the hand-written one
from sklearn.model_selection import train_test_split
# test_size is the test-set proportion; random_state fixes the shuffle seed
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 666)
# Shapes of the training subset (recorded output below)
print(X_train.shape)
print(y_train.shape)
# (120, 4)
# (120,)
# Shapes of the test subset (recorded output below)
print(X_test.shape)
print(y_test.shape)
# (30, 4)
# (30,)

数据集的分割