https://github.com/Avik-Jain/100-Days-Of-ML-Code/blob/master/Code/Day2_Simple_Linear_Regression.md
import pandas as pd
# Read the student-scores CSV (path is relative to the current working directory).
csv_path = 'Desktop/Data_analysis_practice/100-Days-Of-ML/studentscores.csv'
df = pd.read_csv(csv_path)
# Echo the (rows, columns) tuple and the column labels to inspect the data.
df.shape
df.columns
数据总共 25 行 2 列:第一列是学习时间 Hours,第二列是得分 Scores。
df.iloc[0,0] # value at the first row, first column (positional index 0, 0)
Out[18]: 2.5
df.iloc[0,:] # first row, all columns
Out[19]:
Hours 2.5
Scores 21.0
Name: 0, dtype: float64
df.iloc[:,0] # first column, all rows
Out[20]:
0 2.5
1 5.1
2 3.2
3 8.5
4 3.5
5 1.5
6 9.2
7 5.5
8 8.3
9 2.7
10 7.7
11 5.9
12 4.5
13 3.3
14 1.1
15 8.9
16 2.5
17 1.9
18 6.1
19 7.4
20 2.7
21 4.8
22 3.8
23 6.9
24 7.8
Name: Hours, dtype: float64
# Use the first column (Hours) as the feature X and the second column (Scores) as the target Y.
X = df.iloc[:,0].values
Y = df.iloc[:,1].values
# Hold out 25% of the rows for testing; random_state=0 makes the shuffle reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)
import numpy as np
from sklearn.linear_model import LinearRegression
# scikit-learn estimators expect a 2-D feature matrix of shape (n_samples, n_features),
# so the 1-D Hours array is reshaped into a single column.
X_trainNew = np.array(X_train).reshape(-1,1)
# The estimator has to be instantiated before it can be fitted; calling .fit on the
# bare name `regressor` without this line raises NameError.
regressor = LinearRegression()
regressor = regressor.fit(X_trainNew,Y_train)
# Apply the same column reshape to the held-out features, then predict their scores.
X_testNew = np.array(X_test).reshape(-1,1)
Y_pred = regressor.predict(X_testNew)
import matplotlib.pyplot as plt
# Training set: observed (Hours, Scores) points in red, the model's predictions as a blue line.
plt.scatter(X_trainNew, Y_train, color='red')
train_fit = regressor.predict(X_trainNew)
plt.plot(X_trainNew, train_fit, color='blue')
plt.show()
image.png
# Test set: held-out points in red, the model's predictions for them as a blue line.
plt.scatter(X_testNew, Y_test, color='red')
test_fit = regressor.predict(X_testNew)
plt.plot(X_testNew, test_fit, color='blue')
plt.show()
image.png
# Score the fitted model on the held-out test split.
# NOTE(review): for scikit-learn regressors score() is presumably the R^2 coefficient
# of determination — confirm against the estimator actually bound to `regressor`.
regressor.score(X_testNew,Y_test)
0.93676610433650542