Why do scipy.optimize.nnls and sklearn.linear_model.LinearRegression produce different results? A Super Learner question

wb1gzix0 · posted 5 months ago · in: Other

I am trying to implement my own version of the Super Learner in Python. Here is the code:

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import neighbors
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy import optimize
from pandas.plotting import scatter_matrix
import numpy as np
import pandas as pd 

class SuperLearner(BaseEstimator, RegressorMixin):
    
    def __init__(self, base_estimators):
        self.base_estimators = base_estimators
        self.meta_learner = linear_model.LinearRegression(positive=True)
        self.weights = None

        
    def rss(self, weights, X, y):
        # Residual sum of squares of a weighted combination of predictions
        y_pred = np.dot(X, weights)
        return np.sum((y - y_pred)**2)
    
    def constraint(self, weights):
        # Equality constraint: the weights must sum to 1
        return np.sum(weights) - 1
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        meta_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        #TODO: modify the number of folds depending on the number of base estimators and the size of the dataset
        kf = KFold(n_splits=5)        
        
        # Out-of-fold predictions from each base estimator form the design
        # matrix for the meta-learner
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            for j, estimator in enumerate(self.base_estimators):
                estimator.fit(X_train, y_train)
                meta_predictions[val_idx, j] = estimator.predict(X_val)
        
        # Refit each base estimator on the full data for use at predict time;
        # otherwise each one stays fitted on the last fold only
        for estimator in self.base_estimators:
            estimator.fit(X, y)
        
        # Start from equal weights; np.empty leaves the initial guess
        # uninitialized, which makes the SLSQP result non-deterministic
        n_estimators = len(self.base_estimators)
        guess = np.full(n_estimators, 1.0 / n_estimators)
        bounds = [(0, 1)] * n_estimators
        
        # Method 1: constrained optimization (non-negative weights that sum to 1)
        result = optimize.minimize(self.rss, guess, args=(meta_predictions, y), method='SLSQP', bounds=bounds, constraints={'type': 'eq', 'fun': self.constraint})
        print(result.x, np.sum(result.x))
        # Method 2: non-negative least squares
        result = optimize.nnls(meta_predictions, y)
        print(result[0], np.sum(result[0]))
        
        # Method 3: sklearn's least squares with positive coefficients,
        # normalized afterwards so the weights sum to 1
        self.meta_learner.fit(meta_predictions, y)
        self.weights = self.meta_learner.coef_
        self.weights = self.weights / np.sum(self.weights)
        
        print(self.weights, np.sum(self.weights))
       
        
        return self
    
    def predict(self, X):
        check_is_fitted(self, 'meta_learner')
        X = check_array(X)
        
        # Combine the base estimators' predictions with the learned weights
        base_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        for i, estimator in enumerate(self.base_estimators):
            base_predictions[:, i] = estimator.predict(X)
            
        return np.dot(base_predictions, self.weights)

def main():
    np.random.seed(100)
    X, y = datasets.make_friedman1(1000)
    
    ols = linear_model.LinearRegression()
    elastic = linear_model.ElasticNetCV()
    ridge = linear_model.RidgeCV()
    lars = linear_model.LarsCV()
    lasso = linear_model.LassoCV()
    knn = neighbors.KNeighborsRegressor()
    
    superLearner = SuperLearner([ols, elastic, ridge, lars, lasso, knn])
    
    superLearner.fit(X, y)
    y_pred = superLearner.predict(X)
    
    print("MSE: ", np.mean((y_pred - y)**2))
    
    
    
if __name__ == "__main__":
    main()

I use three different methods to estimate the weight each model should get in the final prediction. While scipy.optimize.nnls and my own constrained optimization via scipy.optimize.minimize produce similar results, sklearn's LinearRegression produces completely different ones. I even looked at the LinearRegression source on GitHub, and it appears to call the same scipy function (scipy.optimize.nnls) when the positive parameter is set to True, as it is here. Does anyone know why?
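
A minimal sketch that reproduces the discrepancy outside the Super Learner (toy data and names of my own choosing, not the meta_predictions matrix above): non-negative true weights plus a constant offset.

import numpy as np
from scipy import optimize
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
A = rng.random((200, 3))                      # toy design matrix
y = A @ np.array([0.2, 0.5, 0.3]) + 1.0       # non-negative weights plus an offset

w_nnls, _ = optimize.nnls(A, y)               # NNLS on the raw data
lr = LinearRegression(positive=True).fit(A, y)

print(w_nnls)                   # distorted: the offset leaks into the coefficients
print(lr.coef_, lr.intercept_)  # ~(0.2, 0.5, 0.3) with intercept ~1.0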

j2datikz1#

Try LinearRegression(positive=True, fit_intercept=False), because fit_intercept defaults to True. When an intercept is fitted, sklearn centers X and y before handing them to scipy.optimize.nnls, so it solves a different (centered) problem than calling nnls on the raw data.
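
A minimal sketch of the fix, reusing the toy setup from the question (assumed data, not the asker's meta_predictions):

import numpy as np
from scipy import optimize
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
A = rng.random((200, 3))
y = A @ np.array([0.2, 0.5, 0.3]) + 1.0

w_nnls, _ = optimize.nnls(A, y)
lr = LinearRegression(positive=True, fit_intercept=False).fit(A, y)

# With the intercept disabled, sklearn passes the raw (uncentered) data to the
# same NNLS solver, so the two solutions coincide
print(np.allclose(lr.coef_, w_nnls))  # True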
