我试图用逻辑回归计算一个简单的 cross_val_score,但我不明白为什么会出现错误:“数组索引太多”
关于数据框,有11个输入变量,都是dtype float64。有 2 个输出变量,quality (int) 和 cat_quality (string)(这是对质量的分类,只有两个可能的值:good / not_good)
有 1599 个观测值。
这是我的代码:
def split_input_output(X):
target = X[['cat_quality']]
X = X.drop(['quality','cat_quality'], axis=1) #quality is a non numerical variable
return X, target
X, y = split_input_output(df.copy())
lr = LogisticRegression()
cross_val_score(lr, X, y)
这是错误消息:
IndexErrorTraceback (most recent call last)
<ipython-input-107-f8517fdb5e22> in <module>()
8
9 lr = LogisticRegression()
---> 10 cross_val_score(lr, X, y)
11
12 #lr.fit(X, y)
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
319 n_jobs=n_jobs, verbose=verbose,
320 fit_params=fit_params,
--> 321 pre_dispatch=pre_dispatch)
322 return cv_results['test_score']
323
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
193 fit_params, return_train_score=return_train_score,
194 return_times=True)
--> 195 for train, test in cv.split(X, y, groups))
196
197 if return_train_score:
C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator)
618
619 with self._lock:
--> 620 tasks = BatchedCalls(itertools.islice(iterator, batch_size))
621 if len(tasks) == 0:
622 # No more tasks available in the iterator: tell caller to stop.
C:\Users\user\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, iterator_slice)
125
126 def __init__(self, iterator_slice):
--> 127 self.items = list(iterator_slice)
128 self._size = len(self.items)
129
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in <genexpr>(***failed resolving arguments***)
189 pre_dispatch=pre_dispatch)
190 scores = parallel(
--> 191 delayed(_fit_and_score)(
192 clone(estimator), X, y, scorers, train, test, verbose, None,
193 fit_params, return_train_score=return_train_score,
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in split(self, X, y, groups)
330 n_samples))
331
--> 332 for train, test in super(_BaseKFold, self).split(X, y, groups):
333 yield train, test
334
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in split(self, X, y, groups)
93 X, y, groups = indexable(X, y, groups)
94 indices = np.arange(_num_samples(X))
---> 95 for test_index in self._iter_test_masks(X, y, groups):
96 train_index = indices[np.logical_not(test_index)]
97 test_index = indices[test_index]
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in _iter_test_masks(self, X, y, groups)
624
625 def _iter_test_masks(self, X, y=None, groups=None):
--> 626 test_folds = self._make_test_folds(X, y)
627 for i in range(self.n_splits):
628 yield test_folds == i
C:\Users\user\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in _make_test_folds(self, X, y)
611 for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
612 for cls, (_, test_split) in zip(unique_y, per_cls_splits):
--> 613 cls_test_folds = test_folds[y == cls]
614 # the test split can be too big because we used
615 # KFold(...).split(X[:max(c, n_splits)]) when data is not 100%
IndexError: too many indices for array
谢谢您的回答。
编辑:下面的代码(我没有复制所有很长且不相关的数据可视化部分,因为我没有修改其中的数据)
from pylab import *
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import *
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score
# Data reading
df_red = pd.read_csv(r'C:\Users\user\path\winequality-red.csv', sep=";")
df_white = pd.read_csv(r'C:\Users\user\path\winequality-white.csv', sep=";")
# cat_quality
def categorization(X):
if X<=6:
cat = "not_great"
else:
cat = "great"
return cat
df_red["cat_quality"] = np.vectorize(categorization)(df_red["quality"])
df_white["cat_quality"] = np.vectorize(categorization)(df_white["quality"])
# Cross validation <<< error at the end of this part
def input_output(X):
target = X[['cat_quality']]
X = X.drop(['quality','cat_quality'], axis=1)
return X, target
from sklearn.model_selection import cross_val_score
def compute_score(clf, X, y) :
xval = cross_val_score(clf, X, y, cv = 5)
return xval
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# df_red
X, y = input_output(df_red.copy())
compute_score(lr, X, y)
compute_score(lr, X, y).mean()