1

我试图用逻辑回归计算一个简单的 cross_val_score,但我不明白为什么会出现错误:“数组索引太多”

关于数据框,有11个输入变量,都是dtype float64。有 2 个输出变量,quality (int) 和 cat_quality (string)(这是对质量的分类,只有两个可能的值:good / not_good)

有 1599 个观测值。

这是我的代码:

def split_input_output(X):
    target = X[['cat_quality']]
    X = X.drop(['quality','cat_quality'], axis=1) #quality is a non numerical variable
    return X, target

X, y = split_input_output(df.copy())
lr = LogisticRegression()
cross_val_score(lr, X, y)

这是错误消息:

IndexErrorTraceback (most recent call last)
<ipython-input-107-f8517fdb5e22> in <module>()
      8 
      9 lr = LogisticRegression()
---> 10 cross_val_score(lr, X, y)
     11 
     12 #lr.fit(X, y)

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    319                                 n_jobs=n_jobs, verbose=verbose,
    320                                 fit_params=fit_params,
--> 321                                 pre_dispatch=pre_dispatch)
    322     return cv_results['test_score']
    323 

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
    193             fit_params, return_train_score=return_train_score,
    194             return_times=True)
--> 195         for train, test in cv.split(X, y, groups))
    196 
    197     if return_train_score:

C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator)
    618 
    619         with self._lock:
--> 620             tasks = BatchedCalls(itertools.islice(iterator, batch_size))
    621             if len(tasks) == 0:
    622                 # No more tasks available in the iterator: tell caller to stop.

C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, iterator_slice)
    125 
    126     def __init__(self, iterator_slice):
--> 127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in <genexpr>(***failed resolving arguments***)
    189                         pre_dispatch=pre_dispatch)
    190     scores = parallel(
--> 191         delayed(_fit_and_score)(
    192             clone(estimator), X, y, scorers, train, test, verbose, None,
    193             fit_params, return_train_score=return_train_score,

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in split(self, X, y, groups)
    330                                                              n_samples))
    331 
--> 332         for train, test in super(_BaseKFold, self).split(X, y, groups):
    333             yield train, test
    334 

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in split(self, X, y, groups)
     93         X, y, groups = indexable(X, y, groups)
     94         indices = np.arange(_num_samples(X))
---> 95         for test_index in self._iter_test_masks(X, y, groups):
     96             train_index = indices[np.logical_not(test_index)]
     97             test_index = indices[test_index]

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in _iter_test_masks(self, X, y, groups)
    624 
    625     def _iter_test_masks(self, X, y=None, groups=None):
--> 626         test_folds = self._make_test_folds(X, y)
    627         for i in range(self.n_splits):
    628             yield test_folds == i

C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in _make_test_folds(self, X, y)
    611         for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
    612             for cls, (_, test_split) in zip(unique_y, per_cls_splits):
--> 613                 cls_test_folds = test_folds[y == cls]
    614                 # the test split can be too big because we used
    615                 # KFold(...).split(X[:max(c, n_splits)]) when data is not 100%

IndexError: too many indices for array

谢谢您的回答。

编辑:下面的代码(我没有复制所有很长且不相关的数据可视化部分,因为我没有修改其中的数据)

from pylab import *
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import *
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score

# Data reading
df_red = pd.read_csv(r'C:\Users\user\path\winequality-red.csv', sep=";")
df_white = pd.read_csv(r'C:\Users\user\path\winequality-white.csv', sep=";")

# cat_quality
def categorization(X):
    if X<=6:
        cat = "not_great"
    else:
        cat = "great"
    return cat

df_red["cat_quality"] = np.vectorize(categorization)(df_red["quality"])
df_white["cat_quality"] = np.vectorize(categorization)(df_white["quality"])

# Cross validation <<< error at the end of this part
def input_output(X):
    target = X[['cat_quality']]
    X = X.drop(['quality','cat_quality'], axis=1)
    return X, target


from sklearn.model_selection import cross_val_score
def compute_score(clf, X, y) :
    xval = cross_val_score(clf, X, y, cv = 5)
    return xval

from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# df_red
X, y = input_output(df_red.copy())

compute_score(lr, X, y)
compute_score(lr, X, y).mean()
4

0 回答 0