包的导入以及图像上中文字体设置、图像清晰度设置

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['STfangsong']
plt.rcParams['axes.unicode_minus'] = False

%config InlineBackend.figure_format = 'svg'

常用函数

array1 = np.arange(1, 10).reshape(3, 3)
array1

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

array2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
array2

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])

# 水平方向拼接
np.hstack((array1, array2))

array([[1, 2, 3, 1, 1, 1],
       [4, 5, 6, 2, 2, 2],
       [7, 8, 9, 3, 3, 3]])

# 垂直方向拼接
np.vstack((array1, array2))

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])

# 沿着指定的轴拼接
np.concatenate((array1, array2))

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])

np.concatenate((array1, array2), axis=1)

array([[1, 2, 3, 1, 1, 1],
       [4, 5, 6, 2, 2, 2],
       [7, 8, 9, 3, 3, 3]])

# 垂直方向拆分
np.vsplit(array2, 3)

[array([[1, 1, 1]]), array([[2, 2, 2]]), array([[3, 3, 3]])]

# 水平方向拆分
np.hsplit(array2, 3)

[array([[1],
        [2],
        [3]]),
 array([[1],
        [2],
        [3]]),
 array([[1],
        [2],
        [3]])]

# 在末尾追加元素
np.append(array1, 10)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

# 在指定位置插入元素
np.insert(array1, 0, 0)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array1[array1 % 3 == 0]

array([3, 6, 9])

# 根据条件筛选数据
np.extract(array1 % 3 == 0, array1)

array([3, 6, 9])

# 根据条件和公式获取数据
x = np.arange(10)
condlist = [x < 3, x > 5]
choicelist = [x, x ** 2]
np.select(condlist, choicelist, default=np.nan)

array([ 0.,  1.,  2., nan, nan, nan, 36., 49., 64., 81.])

# 根据条件和公式获取数据
np.where(x < 5, x, 10 * x)

array([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])

def fib(n):
    """Yield the first ``n`` Fibonacci numbers, starting with 1, 1, 2, 3, ..."""
    current, following = 1, 1
    for _step in range(n):
        # Emit the current term first, then slide the pair one position forward.
        yield current
        current, following = following, current + following

        
gen = fib(20)
# 通过迭代器（生成器）创建数组对象
array3 = np.fromiter(gen, dtype=np.int64, count=10)
array3

array([ 1,  1,  2,  3,  5,  8, 13, 21, 34, 55], dtype=int64)

# 调整数组的大小
np.resize(array1, (4, 4))

array([[1, 2, 3, 4],
       [5, 6, 7, 8],
       [9, 1, 2, 3],
       [4, 5, 6, 7]])

向量

点积运算

$$ A \cdot B = a_1b_1 + a_2b_2 = \lvert A \rvert \lvert B \rvert \cos\theta $$

$$ A \cdot B = \sum_{i=1}^{n} a_ib_i = \lvert A \rvert \lvert B \rvert \cos\theta $$

v1 = np.array([3, 5])
v2 = np.array([1, 3])
# inner_prod = np.dot(v1, v2)
inner_prod = np.inner(v1, v2)
print('向量点积:', inner_prod)

向量点积: 18

说明：在欧几里得几何中，两个笛卡尔坐标向量的点积也称为内积（inner product），但是内积的概念比点积更一般：点积相当于是内积在欧几里得空间 $\mathbb{R}^n$ 上的特例，而内积可以推广到更一般的内积空间。

v1_norm = np.linalg.norm(v1)
v2_norm = np.linalg.norm(v2)
print('v1的模:', np.round(v1_norm, 6))
print('v2的模:', np.round(v2_norm, 6))

v1的模: 5.830952
v2的模: 3.162278

cos_theta = inner_prod / (v1_norm * v2_norm)
print('向量夹角余弦值:', cos_theta)
print('夹角:', np.arccos(cos_theta) * 180 / np.pi)

向量夹角余弦值: 0.9761870601839526
夹角: 12.52880770915155

行列式

$$ \det(A) = \sum_{n!} \pm a_{1\alpha}a_{2\beta}a_{3\gamma} \cdots a_{n\omega} $$

array4 = np.stack((v1, v2))
array4

array([[3, 5],
       [1, 3]])

$$ \det \begin{bmatrix} 3 & 5 \\ 1 & 3 \end{bmatrix} = 3 \times 3 - 5 \times 1 = 4 $$

# 计算行列式的值
np.round(np.linalg.det(array4), 2)

4.0

$$ \det \begin{bmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{bmatrix} = 0 $$

np.linalg.det(array1)

0.0

矩阵

array1 = np.arange(1, 10).reshape((3, 3))
array1

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

对上面的array1做一组初等行变换（第2行减去第1行的4倍，第3行减去第1行的7倍），变换后第3行恰好是第2行的2倍，只有两行线性无关，就知道为什么它的秩是2了。

$$ \begin{bmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{bmatrix} \quad \to \quad \begin{bmatrix} 1 & 2 & 3 \\ 0 & -3 & -6 \\ 0 & -6 & -12 \end{bmatrix} $$

# 求逆矩阵
# LinAlgError ---> Singular matrix ---> 奇异矩阵不能求逆
# np.linalg.inv(array1)

array2 = np.array([[1, 2], [3, 4]])
array2

array([[1, 2],
       [3, 4]])

array3 = np.linalg.inv(array2)
array3

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

$$ A \cdot A^{-1} = I $$

np.round(array2 @ array3, 2)

array([[1., 0.],
       [0., 1.]])

# 求矩阵的秩
np.linalg.matrix_rank(array1)

array1[2, 2] = 8
array1

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 8]])

np.linalg.matrix_rank(array1)

解线性方程：

$$ \begin{cases} 3x + y = 9 \\ x + 2y = 8 \end{cases} $$

A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8]).reshape(-1, 1)

np.linalg.solve(A, b)

array([[2.],
       [3.]])

$$ \begin{aligned} Ax &= b \\ A^{-1}Ax &= A^{-1}b \\ Ix &= A^{-1}b \\ x &= A^{-1}b \end{aligned} $$

A_1 = np.linalg.inv(A)
A_1

array([[ 0.4, -0.2],
       [-0.2,  0.6]])

A_1 @ b

array([[2.],
       [3.]])

最小二乘解

!pip install scikit-learn

Looking in indexes: https://pypi.doubanio.com/simple
Requirement already satisfied: scikit-learn in d:\programs\python\python38\lib\site-packages (0.24.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: numpy>=1.13.3 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.21.2)
Requirement already satisfied: joblib>=0.11 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.0.1)
Requirement already satisfied: scipy>=0.19.1 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.7.1)

from sklearn.datasets import load_boston

# Load the Boston house-price dataset and print its bundled description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older release (the pip log above shows
# 0.24.2) — confirm the installed version before re-running.
dataset = load_boston()
print(dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of black people by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

dataset.data.shape

(506, 13)

dataset.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

# 用波士顿房价数据创建DataFrame对象
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.0	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.0	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.0	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.0	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.0	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33
...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.06263	0.0	11.93	0.0	0.573	6.593	69.1	2.4786	1.0	273.0	21.0	391.99	9.67
502	0.04527	0.0	11.93	0.0	0.573	6.120	76.7	2.2875	1.0	273.0	21.0	396.90	9.08
503	0.06076	0.0	11.93	0.0	0.573	6.976	91.0	2.1675	1.0	273.0	21.0	396.90	5.64
504	0.10959	0.0	11.93	0.0	0.573	6.794	89.3	2.3889	1.0	273.0	21.0	393.45	6.48
505	0.04741	0.0	11.93	0.0	0.573	6.030	80.8	2.5050	1.0	273.0	21.0	396.90	7.88

506 rows × 13 columns

# 添加房价列
df['PRICE'] = dataset.target
df

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
0	0.00632	18.0	2.31	0.0	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.0	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.0	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.0	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.0	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33	36.2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.06263	0.0	11.93	0.0	0.573	6.593	69.1	2.4786	1.0	273.0	21.0	391.99	9.67	22.4
502	0.04527	0.0	11.93	0.0	0.573	6.120	76.7	2.2875	1.0	273.0	21.0	396.90	9.08	20.6
503	0.06076	0.0	11.93	0.0	0.573	6.976	91.0	2.1675	1.0	273.0	21.0	396.90	5.64	23.9
504	0.10959	0.0	11.93	0.0	0.573	6.794	89.3	2.3889	1.0	273.0	21.0	393.45	6.48	22.0
505	0.04741	0.0	11.93	0.0	0.573	6.030	80.8	2.5050	1.0	273.0	21.0	396.90	7.88	11.9

506 rows × 14 columns

# 计算协方差
df.cov()

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
CRIM	73.986578	-40.215956	23.992339	-0.122109	0.419594	-1.325038	85.405322	-6.876722	46.847761	844.821538	5.399331	-302.381816	27.986168	-30.718508
ZN	-40.215956	543.936814	-85.412648	-0.252925	-1.396148	5.112513	-373.901548	32.629304	-63.348695	-1236.453735	-19.776571	373.721402	-68.783037	77.315176
INDUS	23.992339	-85.412648	47.064442	0.109669	0.607074	-1.887957	124.513903	-10.228097	35.549971	833.360290	5.692104	-223.579756	29.580270	-30.520823
CHAS	-0.122109	-0.252925	0.109669	0.064513	0.002684	0.016285	0.618571	-0.053043	-0.016296	-1.523367	-0.066819	1.131325	-0.097816	0.409409
NOX	0.419594	-1.396148	0.607074	0.002684	0.013428	-0.024603	2.385927	-0.187696	0.616929	13.046286	0.047397	-4.020570	0.488946	-0.455412
RM	-1.325038	5.112513	-1.887957	0.016285	-0.024603	0.493671	-4.751929	0.303663	-1.283815	-34.583448	-0.540763	8.215006	-3.079741	4.493446
AGE	85.405322	-373.901548	124.513903	0.618571	2.385927	-4.751929	792.358399	-44.329379	111.770846	2402.690122	15.936921	-702.940328	121.077725	-97.589017
DIS	-6.876722	32.629304	-10.228097	-0.053043	-0.187696	0.303663	-44.329379	4.434015	-9.068252	-189.664592	-1.059775	56.040356	-7.473329	4.840229
RAD	46.847761	-63.348695	35.549971	-0.016296	0.616929	-1.283815	111.770846	-9.068252	75.816366	1335.756577	8.760716	-353.276219	30.385442	-30.561228
TAX	844.821538	-1236.453735	833.360290	-1.523367	13.046286	-34.583448	2402.690122	-189.664592	1335.756577	28404.759488	168.153141	-6797.911215	654.714520	-726.255716
PTRATIO	5.399331	-19.776571	5.692104	-0.066819	0.047397	-0.540763	15.936921	-1.059775	8.760716	168.153141	4.686989	-35.059527	5.782729	-10.110657
B	-302.381816	373.721402	-223.579756	1.131325	-4.020570	8.215006	-702.940328	56.040356	-353.276219	-6797.911215	-35.059527	8334.752263	-238.667516	279.989834
LSTAT	27.986168	-68.783037	29.580270	-0.097816	0.488946	-3.079741	121.077725	-7.473329	30.385442	654.714520	5.782729	-238.667516	50.994760	-48.447538
PRICE	-30.718508	77.315176	-30.520823	0.409409	-0.455412	4.493446	-97.589017	4.840229	-30.561228	-726.255716	-10.110657	279.989834	-48.447538	84.586724

# 计算皮尔逊相关系数
np.round(df.corr(), 2)

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
CRIM	1.00	-0.20	0.41	-0.06	0.42	-0.22	0.35	-0.38	0.63	0.58	0.29	-0.39	0.46	-0.39
ZN	-0.20	1.00	-0.53	-0.04	-0.52	0.31	-0.57	0.66	-0.31	-0.31	-0.39	0.18	-0.41	0.36
INDUS	0.41	-0.53	1.00	0.06	0.76	-0.39	0.64	-0.71	0.60	0.72	0.38	-0.36	0.60	-0.48
CHAS	-0.06	-0.04	0.06	1.00	0.09	0.09	0.09	-0.10	-0.01	-0.04	-0.12	0.05	-0.05	0.18
NOX	0.42	-0.52	0.76	0.09	1.00	-0.30	0.73	-0.77	0.61	0.67	0.19	-0.38	0.59	-0.43
RM	-0.22	0.31	-0.39	0.09	-0.30	1.00	-0.24	0.21	-0.21	-0.29	-0.36	0.13	-0.61	0.70
AGE	0.35	-0.57	0.64	0.09	0.73	-0.24	1.00	-0.75	0.46	0.51	0.26	-0.27	0.60	-0.38
DIS	-0.38	0.66	-0.71	-0.10	-0.77	0.21	-0.75	1.00	-0.49	-0.53	-0.23	0.29	-0.50	0.25
RAD	0.63	-0.31	0.60	-0.01	0.61	-0.21	0.46	-0.49	1.00	0.91	0.46	-0.44	0.49	-0.38
TAX	0.58	-0.31	0.72	-0.04	0.67	-0.29	0.51	-0.53	0.91	1.00	0.46	-0.44	0.54	-0.47
PTRATIO	0.29	-0.39	0.38	-0.12	0.19	-0.36	0.26	-0.23	0.46	0.46	1.00	-0.18	0.37	-0.51
B	-0.39	0.18	-0.36	0.05	-0.38	0.13	-0.27	0.29	-0.44	-0.44	-0.18	1.00	-0.37	0.33
LSTAT	0.46	-0.41	0.60	-0.05	0.59	-0.61	0.60	-0.50	0.49	0.54	0.37	-0.37	1.00	-0.74
PRICE	-0.39	0.36	-0.48	0.18	-0.43	0.70	-0.38	0.25	-0.38	-0.47	-0.51	0.33	-0.74	1.00

# Pair every room count with its price. Room counts are the dict keys, so rows
# that share the same RM value collapse into one entry and the last price wins.
rooms, prices = df['RM'].values, df['PRICE'].values
history_data = dict(zip(rooms, prices))
history_data

{6.575: 24.0,
 6.421: 21.6,
 7.185: 34.9,
 6.998: 33.4,
 7.147: 36.2,
 6.43: 28.7,
 6.012: 22.9,
 6.172: 27.1,
 5.631: 16.5,
 6.004: 20.3,
 6.377: 15.0,
 6.009: 21.7,
 5.889: 21.7,
 5.949: 20.4,
 6.096: 13.5,
 5.834: 19.9,
 5.935: 8.4,
 5.99: 17.5,
 5.456: 20.2,
 5.727: 18.2,
 5.57: 13.6,
 5.965: 19.6,
 6.142: 15.2,
 5.813: 16.6,
 5.924: 15.6,
 5.599: 13.9,
 6.047: 14.8,
 6.495: 26.4,
 6.674: 21.0,
 5.713: 20.1,
 6.072: 14.5,
 5.95: 13.2,
 5.701: 13.1,
 5.933: 18.9,
 5.841: 20.0,
 5.85: 21.0,
 5.966: 16.0,
 6.595: 30.8,
 7.024: 34.9,
 6.77: 26.6,
 6.169: 25.3,
 6.211: 25.0,
 6.069: 21.2,
 5.682: 19.3,
 5.786: 20.0,
 6.03: 11.9,
 5.399: 14.4,
 5.602: 19.4,
 5.963: 19.7,
 6.115: 20.5,
 6.511: 25.0,
 5.998: 23.4,
 5.888: 23.3,
 7.249: 35.4,
 6.383: 24.7,
 6.816: 31.6,
 6.145: 23.3,
 5.927: 19.6,
 5.741: 18.7,
 6.456: 22.2,
 6.762: 25.0,
 7.104: 33.0,
 6.29: 23.5,
 5.787: 19.4,
 5.878: 22.0,
 5.594: 17.4,
 5.885: 20.9,
 6.417: 13.0,
 5.961: 20.5,
 6.065: 22.8,
 6.245: 23.4,
 6.273: 24.1,
 6.286: 21.4,
 6.279: 20.0,
 6.14: 20.8,
 6.232: 21.2,
 5.874: 20.3,
 6.727: 27.5,
 6.619: 23.9,
 6.302: 24.8,
 6.167: 19.9,
 6.389: 23.9,
 6.63: 27.9,
 6.015: 22.5,
 6.121: 22.2,
 7.007: 23.6,
 7.079: 28.7,
 6.405: 12.5,
 6.442: 22.9,
 6.249: 20.6,
 6.625: 28.4,
 6.163: 21.4,
 8.069: 38.7,
 7.82: 45.4,
 7.416: 33.2,
 6.781: 26.5,
 6.137: 19.3,
 5.851: 19.5,
 5.836: 19.5,
 6.127: 22.7,
 6.474: 19.8,
 6.229: 21.4,
 6.195: 21.7,
 6.715: 22.8,
 5.913: 18.8,
 6.092: 18.7,
 6.254: 18.5,
 5.928: 18.3,
 6.176: 21.2,
 6.021: 19.2,
 5.872: 20.4,
 5.731: 19.3,
 5.87: 22.0,
 5.856: 21.1,
 5.879: 18.8,
 5.986: 21.4,
 5.613: 15.7,
 5.693: 16.2,
 6.431: 24.6,
 5.637: 14.3,
 6.458: 19.2,
 6.326: 24.4,
 6.372: 23.0,
 5.822: 18.4,
 5.757: 15.0,
 6.335: 18.1,
 5.942: 17.4,
 6.454: 17.1,
 5.857: 13.3,
 6.151: 17.8,
 6.174: 14.0,
 5.019: 14.4,
 5.403: 13.4,
 5.468: 15.6,
 4.903: 11.8,
 6.13: 13.8,
 5.628: 15.6,
 4.926: 14.6,
 5.186: 17.8,
 5.597: 15.4,
 6.122: 22.1,
 5.404: 19.3,
 5.012: 15.3,
 5.709: 19.4,
 6.129: 17.0,
 6.152: 8.7,
 5.272: 13.1,
 6.943: 41.3,
 6.066: 24.3,
 6.51: 23.3,
 6.25: 27.0,
 7.489: 50.0,
 7.802: 50.0,
 8.375: 50.0,
 5.854: 10.8,
 6.101: 25.0,
 7.929: 50.0,
 5.877: 23.8,
 6.319: 23.8,
 6.402: 22.3,
 5.875: 50.0,
 5.88: 19.1,
 5.572: 23.1,
 6.416: 23.6,
 5.859: 22.6,
 6.546: 29.4,
 6.02: 23.2,
 6.315: 22.3,
 6.86: 29.9,
 6.98: 29.8,
 7.765: 39.8,
 6.144: 19.8,
 7.155: 37.9,
 6.563: 32.5,
 5.604: 26.4,
 6.153: 29.6,
 7.831: 50.0,
 6.782: 7.5,
 6.556: 29.8,
 6.951: 26.7,
 6.739: 30.5,
 7.178: 36.4,
 6.8: 31.1,
 6.604: 29.1,
 7.875: 50.0,
 7.287: 33.3,
 7.107: 30.3,
 7.274: 34.6,
 6.975: 34.9,
 7.135: 32.9,
 6.162: 13.3,
 7.61: 42.3,
 7.853: 48.5,
 8.034: 50.0,
 5.891: 22.6,
 5.783: 22.5,
 6.064: 24.4,
 5.344: 20.0,
 5.96: 21.7,
 5.807: 22.4,
 6.375: 28.1,
 5.412: 23.7,
 6.182: 25.0,
 6.642: 28.7,
 5.951: 21.5,
 6.373: 23.0,
 6.164: 21.7,
 6.879: 27.5,
 6.618: 30.1,
 8.266: 44.8,
 8.725: 50.0,
 8.04: 37.6,
 7.163: 31.6,
 7.686: 46.7,
 6.552: 31.5,
 5.981: 24.3,
 7.412: 31.7,
 8.337: 41.7,
 8.247: 48.3,
 6.726: 29.0,
 6.086: 24.0,
 6.631: 25.1,
 7.358: 31.5,
 6.481: 23.7,
 6.606: 23.3,
 6.897: 22.0,
 6.095: 20.1,
 6.358: 22.2,
 6.393: 23.7,
 5.593: 17.6,
 5.605: 18.5,
 6.108: 21.9,
 6.226: 20.5,
 6.433: 24.5,
 6.718: 26.2,
 6.487: 24.4,
 6.438: 24.8,
 6.957: 29.6,
 8.259: 42.8,
 5.876: 20.9,
 7.454: 44.0,
 8.704: 50.0,
 7.333: 36.0,
 6.842: 30.1,
 7.203: 33.8,
 7.52: 43.1,
 8.398: 48.8,
 7.327: 31.0,
 7.206: 36.5,
 5.56: 22.8,
 7.014: 30.7,
 8.297: 50.0,
 7.47: 43.5,
 5.92: 20.7,
 6.24: 25.2,
 6.538: 24.4,
 7.691: 35.2,
 6.758: 32.4,
 6.854: 32.0,
 7.267: 33.2,
 6.826: 33.1,
 6.482: 29.1,
 6.812: 35.1,
 6.968: 10.4,
 7.645: 46.0,
 7.923: 50.0,
 7.088: 32.2,
 6.453: 22.0,
 6.23: 20.1,
 6.209: 21.4,
 6.565: 24.8,
 6.861: 28.5,
 7.148: 37.3,
 6.678: 28.6,
 6.549: 27.1,
 5.79: 20.3,
 6.345: 22.5,
 7.041: 29.0,
 6.871: 24.8,
 6.59: 22.0,
 6.982: 33.1,
 7.236: 36.1,
 6.616: 28.4,
 7.42: 33.4,
 6.849: 28.2,
 6.635: 24.5,
 5.972: 20.3,
 4.973: 16.1,
 6.023: 19.4,
 6.266: 21.6,
 6.567: 23.8,
 5.705: 16.2,
 5.914: 17.8,
 5.782: 19.8,
 6.382: 23.1,
 6.113: 21.0,
 6.426: 23.8,
 6.376: 17.7,
 6.041: 20.4,
 5.708: 18.5,
 6.415: 25.0,
 6.312: 21.2,
 6.083: 22.2,
 5.868: 19.3,
 6.333: 22.6,
 5.706: 17.1,
 6.031: 19.4,
 6.316: 22.2,
 6.31: 20.7,
 6.037: 21.1,
 5.869: 19.5,
 5.895: 18.5,
 6.059: 20.6,
 5.985: 19.0,
 5.968: 18.7,
 7.241: 32.7,
 6.54: 16.5,
 6.696: 23.9,
 6.874: 31.2,
 6.014: 17.5,
 5.898: 17.2,
 6.516: 23.1,
 6.939: 26.6,
 6.49: 22.9,
 6.579: 24.1,
 5.884: 18.6,
 6.728: 14.9,
 5.663: 18.2,
 5.936: 13.5,
 6.212: 17.8,
 6.395: 21.7,
 6.112: 22.6,
 6.398: 25.0,
 6.251: 12.6,
 5.362: 20.8,
 5.803: 16.8,
 8.78: 21.9,
 3.561: 27.5,
 4.963: 21.9,
 3.863: 23.1,
 4.97: 50.0,
 6.683: 50.0,
 7.016: 50.0,
 6.216: 50.0,
 4.906: 13.8,
 4.138: 11.9,
 7.313: 15.0,
 6.649: 13.9,
 6.794: 22.0,
 6.38: 9.5,
 6.223: 10.2,
 6.545: 10.9,
 5.536: 11.3,
 5.52: 12.3,
 4.368: 8.8,
 5.277: 7.2,
 4.652: 10.5,
 5.0: 7.4,
 4.88: 10.2,
 5.39: 19.7,
 6.051: 23.2,
 5.036: 9.7,
 6.193: 11.0,
 5.887: 12.7,
 6.471: 13.1,
 5.747: 8.5,
 5.453: 5.0,
 5.852: 6.3,
 5.987: 5.6,
 6.343: 7.2,
 6.404: 12.1,
 5.349: 8.3,
 5.531: 8.5,
 5.683: 5.0,
 5.608: 27.9,
 5.617: 17.2,
 6.852: 27.5,
 6.657: 17.2,
 4.628: 17.9,
 5.155: 16.3,
 4.519: 7.0,
 6.434: 7.2,
 5.304: 12.0,
 5.957: 8.8,
 6.824: 8.4,
 6.411: 16.7,
 6.006: 14.2,
 5.648: 20.8,
 6.103: 13.4,
 5.565: 11.7,
 5.896: 8.3,
 5.837: 10.2,
 6.202: 10.9,
 6.348: 14.5,
 6.833: 14.1,
 6.425: 16.1,
 6.436: 14.3,
 6.208: 11.7,
 6.629: 13.4,
 6.461: 9.6,
 5.627: 12.8,
 5.818: 10.5,
 6.406: 17.1,
 6.219: 18.4,
 6.485: 15.4,
 6.459: 11.8,
 6.341: 14.9,
 6.185: 14.6,
 6.749: 13.4,
 6.655: 15.2,
 6.297: 16.1,
 7.393: 17.8,
 6.525: 14.1,
 5.976: 12.7,
 6.301: 14.9,
 6.081: 20.0,
 6.701: 16.4,
 6.317: 19.5,
 6.513: 20.2,
 5.759: 19.9,
 5.952: 19.0,
 6.003: 19.1,
 5.926: 24.5,
 6.437: 23.2,
 5.427: 13.8,
 6.484: 16.7,
 6.242: 23.0,
 6.75: 23.7,
 7.061: 25.0,
 5.762: 21.8,
 5.871: 20.6,
 6.114: 19.1,
 5.905: 20.6,
 5.454: 15.2,
 5.414: 7.0,
 5.093: 8.1,
 5.983: 20.1,
 5.707: 21.8,
 5.67: 23.1,
 5.794: 18.3,
 6.019: 21.2,
 5.569: 17.5,
 6.027: 16.8,
 6.593: 22.4,
 6.12: 20.6,
 6.976: 23.9}

通过计算皮尔逊相关系数，发现房间数和房价存在正相关，接下来我们通过学习历史数据，最终实现用房间数预测房价的目标。

import heapq

nums = [35, 98, 76, 12, 55, 68, 47, 92]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))

[98, 92, 76]
[12, 35, 47]

import heapq

# kNN algorithm
def predict_price_by_knn(history_data, room, k=5):
    """Predict a price as the mean price of the ``k`` nearest known room counts.

    ``history_data`` maps room count -> price; ``room`` is the room count to
    predict for. Nearness is the squared difference between room counts.
    """
    def squared_distance(known_room):
        return (known_room - room) ** 2

    # Iterating the dict yields its keys, so this picks the k closest room counts.
    nearest_rooms = heapq.nsmallest(k, history_data, key=squared_distance)
    neighbour_prices = [history_data[known_room] for known_room in nearest_rooms]
    return np.mean(neighbour_prices)

# 预测房价
np.round(predict_price_by_knn(history_data, 6.25), 2)

20.42

np.round(predict_price_by_knn(history_data, 5.125), 2)

13.26

# 通过散点图研究变量的关系
plt.scatter(rooms, prices)
plt.show()

通过上面的图，我们发现房间数和房价呈现出线性关系，接下来我们尝试用一个线性函数来实现对房价的预测。

损失函数

回归方程：$x$ 代表房间数，$y$ 就是要预测的房价。
$$ y = ax + b $$

现在我们的问题是找到一组a和b，让预测达到最佳的效果（误差最小就是最佳）。

均方误差：让均方误差最小的 $a$ 和 $b$ 就是最佳拟合。
$$ MSE = \frac{1}{N} \sum_{i=1}^{N} (\hat{y}_i - y_i)^2 $$

def get_loss(x, y, a, b):
    """Loss function: mean squared error of the line y = a * x + b on (x, y)."""
    predicted = a * x + b
    residuals = predicted - y
    return np.mean(residuals ** 2)

# Find the a and b that give the best fit by Monte Carlo simulation.
# NOTE(review): no seed is set in this cell, so the printed values will
# normally differ from run to run.
import random

# Best parameters seen so far and the loss they achieve.
best_a, best_b = None, None
min_loss = np.inf

for _ in range(1000):
    # Draw a and b at random from [-100, 100).
    a = random.random() * 200 - 100
    b = random.random() * 200 - 100
    # Compute the loss (MSE) for this candidate line.
    curr_loss = get_loss(rooms, prices, a, b)
    # A smaller loss means a better fit, so remember these parameters.
    if curr_loss < min_loss:
        min_loss = curr_loss
        best_a, best_b = a, b
print(best_a, best_b)
print(min_loss)

12.414266461017732 -56.48722240398021
50.00741553150247

梯度下降

损失函数是凸函数（convex function，图像开口向上），要找到使函数取最小值的a和b，可以用下面的方法：

$$ a^\prime = a + (-1) \times \frac{\partial loss(a, b)}{\partial a} \times \Delta $$
$$ b^\prime = b + (-1) \times \frac{\partial loss(a, b)}{\partial b} \times \Delta $$

对于求MSE的损失函数来说，可以用下面的公式计算偏导数：

$$ f(a, b) = \frac{1}{N} \sum_{i=1}^{N}(y_i - (ax_i + b))^2 $$
$$ \frac{\partial f(a, b)}{\partial a} = \frac{2}{N} \sum_{i=1}^{N}(-x_iy_i + x_i^2a + x_ib) $$
$$ \frac{\partial f(a, b)}{\partial b} = \frac{2}{N} \sum_{i=1}^{N}(-y_i + x_ia + b) $$

# Partial derivative of the MSE loss with respect to a
def partial_a(x, y, a, b):
    """Return d(MSE)/da for the line y = a * x + b on the samples (x, y)."""
    residuals = y - a * x - b
    # d/da of (y - a*x - b)**2 is 2 * (y - a*x - b) * (-x); average over samples.
    per_sample = residuals * (-x)
    return 2 * np.mean(per_sample)

# Partial derivative of the MSE loss with respect to b
def partial_b(x, y, a, b):
    """Return d(MSE)/db for the line y = a * x + b on the samples (x, y)."""
    # d/db of (y - a*x - b)**2 is 2 * (-y + a*x + b); average over samples.
    signed_errors = -y + a * x + b
    return 2 * np.mean(signed_errors)

# Approach the minimum of the loss by gradient descent.
# This approach finds the best-fitting a and b more quickly.
# The initial a and b can be chosen freely; delta (the step size) must be small enough.
a, b = 35, -35
delta = 0.01

for _ in range(100):
    # Step each parameter against its partial derivative, scaled by delta.
    a = a - partial_a(rooms, prices, a, b) * delta
    # NOTE(review): a was already updated on the line above, so this gradient
    # is evaluated at the new a rather than simultaneously with the a-step.
    b = b - partial_b(rooms, prices, a, b) * delta
print(a, b)
print(get_loss(rooms, prices, a, b))

9.276809660789766 -35.781905844032686
43.61576735104159

# Predict the price with the linear regression equation
def predict_price_by_regression(a, b, x):
    """Return the price predicted by the line y = a * x + b for room count x."""
    slope_term = a * x
    return slope_term + b

# 预测房价
print(np.round(predict_price_by_regression(best_a, best_b, 6.25), 2))
print(np.round(predict_price_by_regression(a, b, 6.25), 2))

21.1
22.2

print(np.round(predict_price_by_regression(best_a, best_b, 5.12), 2))
print(np.round(predict_price_by_regression(a, b, 5.12), 2))

7.07
11.72

# 比较两条拟合曲线
y_hat1 = best_a * rooms + best_b
y_hat2 = a * rooms + b
plt.scatter(rooms, prices)
plt.plot(rooms, y_hat1, color='red', linewidth=4)
plt.plot(rooms, y_hat2, color='green', linewidth=4)
plt.show()

最小二乘解就是用已经得到的历史数据（x和y的值）找到能够最佳拟合这些历史数据的a和b。

$$ y = ax + b $$

对于上面的方程，相当于x是变量a的系数，1是变量b的系数。

`lstsq`函数参数说明

lstsq函数的第一个参数是 $ \begin{bmatrix} x \\ 1 \end{bmatrix}^T $（第一列是x，第二列全为1），第二个参数就是y，rcond参数暂时不管，直接设置为None。

# lstsq函数的第一个参数
param1 = np.vstack([rooms, np.ones(rooms.size)]).T
param1

array([[6.575, 1.   ],
       [6.421, 1.   ],
       [7.185, 1.   ],
       ...,
       [6.976, 1.   ],
       [6.794, 1.   ],
       [6.03 , 1.   ]])

# lstsq函数的第二个参数
param2 = prices
param2

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
       17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
       25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
       23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
       32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
       34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
       20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
       26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
       31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
       22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
       42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
       36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
       32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
       20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
       20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
       22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
       21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
       19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
       32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
       18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
       16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
       13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
        7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
       12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
       27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
        8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
        9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
       10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
       15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
       19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
       29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
       20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
       23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])

`lstsq`函数返回值说明

lstsq函数返回的是一个四元组，四元组中的第一个元素就是要求解的方程的系数，四元组中的第二个元素是误差平方和。

# rcond参数直接设置为None（暂不解释）
result = np.linalg.lstsq(param1, param2, rcond=None)
result

(array([  9.10210898, -34.67062078]),
 array([22061.87919621]),
 2,
 array([143.99484122,   2.46656609]))

a, b = result[0]
mse = result[1][0] / rooms.size
print(a, b)
print(mse)

9.102108981180313 -34.67062077643857
43.600551771169584

# 比较两条拟合曲线
plt.scatter(rooms, prices)
# 梯度下降法给出的a和b预测出的房价
plt.plot(rooms, y_hat2, color='red', linewidth=4)
# lstsq函数给出的a和b预测出的房价
y_hat3 = a * rooms + b
plt.plot(rooms, y_hat3, color='green', linewidth=4)
plt.show()

NumPy的高级应用(example)

常用函数

向量

点积运算

行列式

矩阵

最小二乘解

损失函数

梯度下降

`lstsq`函数参数说明

`lstsq`函数返回值说明

相关文章

热门标签

最新文章

目录

NumPy的高级应用(example)

常用函数

向量

点积运算

行列式

矩阵

最小二乘解

损失函数

梯度下降

lstsq函数参数说明

lstsq函数返回值说明

相关文章

热门标签

最新文章

目录

`lstsq`函数参数说明

`lstsq`函数返回值说明