#MatLab 数据分析 制作图表
#Excel 在设计的时候 做的是小数据处理 数据分析师 7k 8k
#tableau 处理的数据比Excel大一些 mysql postgreSQL Hadoop(分布式文件存储)
#python 和 C 和 Java 和 Go 操控的时候 速度非常的快 DataFrame
In [1]:
import matplotlib.pyplot as plt
import numpy as np
#pip install seaborn
import seaborn as sns
# set_style() changes the global theme; the options noted here are
# 'white', 'whitegrid' (white with grid), 'dark' and 'darkgrid' (dark with grid).
sns.set_style(style='dark')
等差数列
In [19]:
# Linear function f(x) = w*x + b. Only y-values are passed, so plot() uses the
# indices 0..9 as x and the result is the line y = x.
plt.plot(np.arange(10))
# 'image' puts both axes on the same scale with limits fitted to the data.
plt.axis('image')
Out[19]:
(-0.45, 9.45, -0.45, 9.45)
In [20]:
# Ten evenly spaced sample points on [0, 10], both endpoints included.
X = np.linspace(0, 10, num=10)
# Broadcasting applies the scalar slope and intercept to every element of X.
y = 3 + 5 * X
In [21]:
# Plot y against X (both assigned in the preceding cell).
plt.plot(X,y)
Out[21]:
[<matplotlib.lines.Line2D at 0x7f91032ef8d0>]
In [24]:
# Sample sin(x) at ten evenly spaced points on [0, 10] and draw the curve in red.
X = np.linspace(0, 10, num=10)
y = np.sin(X)
plt.plot(X, y, color='red')
Out[24]:
[<matplotlib.lines.Line2D at 0x7f910b20c518>]
np.savetxt()
np.loadtxt()
In [6]:
# delimiter: the field separator used in the file.
# dtype=object stores every field as a Python object; the values read here are
# strings (header row included), so nothing is parsed as a number.
AAPL = np.loadtxt('AAPL.csv', dtype=object, delimiter=',')
In [7]:
# Row 0 holds the column names; the remaining rows are the records.
columns, data = AAPL[0], AAPL[1:]
In [8]:
columns
# date, opening price, high, low, closing price, adjusted closing price, trading volume
Out[8]:
array(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
dtype=object)
In [9]:
data
Out[9]:
array([['1980-12-12', '0.513393', '0.515625', ..., '0.513393',
'0.408971', '117258400'],
['1980-12-15', '0.488839', '0.488839', ..., '0.486607',
'0.387633', '43971200'],
['1980-12-16', '0.453125', '0.453125', ..., '0.450893',
'0.359183', '26432000'],
...,
['2019-08-22', '213.190002', '214.440002', ..., '212.460007',
'212.460007', '22253700'],
['2019-08-23', '209.429993', '212.050003', ..., '202.639999',
'202.639999', '46818000'],
['2019-08-26', '205.860001', '207.190002', ..., '206.490005',
'206.490005', '26043600']], dtype=object)
In [11]:
#pandas
import pandas as pd
In [23]:
# Line plot of the first 1000 rows, using every column except the last one.
(
    pd.read_csv('AAPL.csv')
    .iloc[:1000, :-1]
    .plot(figsize=(18, 10))
)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f910acf90f0>
轴控制 plt.axis()
将椭圆优化成圆形
标题的设置 plt.title()
X轴的标记 plt.xlabel() y轴的标记 plt.ylabel()
图例 plt.legend() 配合 plot()中的属性 label
plot 的参数
In [96]:
# Unit circle drawn as two half-curves: y = +(1 - x^2)^0.5 and y = -(1 - x^2)^0.5.
X = np.linspace(-1, 1, num=50)
y = (1 - np.square(X)) ** 0.5
plt.plot(X, y, label='up', alpha=0.5, marker='x', markersize=20)
plt.plot(X, -y, label='down', alpha=0.5, marker='s')

# 'image' puts both axes on the same scale so the circle is not drawn as an
# ellipse. Other axis settings noted for this cell:
#   plt.axis('equal'), plt.axis('off'), plt.axis([-10, 10, -10, 10])
plt.axis('image')

# Title and axis labels; text properties such as colour, size, rotation and
# transparency are passed as keyword arguments.
plt.title('this is circle')
plt.ylabel('f(x) = (1-X^2)^0.5', color='r', size=20, rotation=90)
plt.xlabel('X', color='b', size=20, rotation=30, alpha=0.5)

# The legend entries come from the label= values given to plot() above.
plt.legend(loc=[0, 1], ncol=2)
Out[96]:
<matplotlib.legend.Legend at 0x7f9101c7ecc0>
1.直方图
In [7]:
# Ten random integers drawn from 0..9 (the upper bound 10 is exclusive).
X = np.random.randint(low=0, high=10, size=10)
X
Out[7]:
array([1, 6, 1, 9, 3, 4, 2, 3, 6, 7])
In [8]:
# Histogram of X with 50 equal-width bins; the call returns the per-bin counts,
# the bin edges and the drawn patches.
plt.hist(X,bins=50)
Out[8]:
(array([2., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0.,
0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.,
0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
array([1. , 1.16, 1.32, 1.48, 1.64, 1.8 , 1.96, 2.12, 2.28, 2.44, 2.6 ,
2.76, 2.92, 3.08, 3.24, 3.4 , 3.56, 3.72, 3.88, 4.04, 4.2 , 4.36,
4.52, 4.68, 4.84, 5. , 5.16, 5.32, 5.48, 5.64, 5.8 , 5.96, 6.12,
6.28, 6.44, 6.6 , 6.76, 6.92, 7.08, 7.24, 7.4 , 7.56, 7.72, 7.88,
8.04, 8.2 , 8.36, 8.52, 8.68, 8.84, 9. ]),
<a list of 50 Patch objects>)
In [10]:
# seaborn additionally draws a density curve on top of the histogram.
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) — confirm the installed version before upgrading.
sns.distplot(X,bins=50)
/home/admin_/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb9e8671898>
需要是二维数据
In [2]:
# Load the 'titanic' example dataset through seaborn.
t = sns.load_dataset('titanic')
In [3]:
t
. . .
In [5]:
# NOTE(review): every row is passed as its own bar (the output reports 891
# artists), so bars sharing an x label are drawn on top of one another.
plt.bar(t['sex'],t['survived'])
Out[5]:
<BarContainer object of 891 artists>
In [12]:
# hue: splits each x category into sub-groups by the given column.
sns.barplot(data=t, x='sex', y='survived', hue='pclass')
/home/admin_/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb9e851d9e8>
饼图表示的是比例
In [15]:
# Not filled: the values sum to 0.89, i.e. less than 1.
# NOTE(review): newer Matplotlib releases normalise the values by default, which
# would draw a full pie — confirm against the installed version.
plt.pie([0.1,0.2,0.33,0.1,0.16])
plt.axis('image')
Out[15]:
(-1.1145098758080207,
1.1006909464670487,
-1.1147008258733009,
1.1177807984384611)
#马云 王建林 马化腾 许家印 李野
#1000 800 1200 900 2000
#np.argmin() 找到最小值的索引
#np.argmax()
In [34]:
# Use seaborn to choose the colours.
# sns.color_palette(palette, n_colors): palette='hls' gives rainbow-like hues,
# n_colors is how many colours to return.
# sns.palplot() displays the colours of a palette ("pal" is presumably short
# for palette).
sns.palplot(sns.color_palette(palette='hls', n_colors=10))
In [38]:
# Sample values and their labels, one label per value.
m = np.array([1000, 800, 1200, 900, 2000])
name = ['MY', 'WJL', 'MHT', 'XJY', 'YE']
# Position of the smallest value.
min_index = m.argmin()
# Per-entry offsets: 0.1 for the smallest value, 0 for every other entry.
exp = np.where(np.arange(m.size) == min_index, 0.1, 0.0)
# explode: per-wedge offset from the centre; the larger the value, the farther
# the wedge is pulled away from the rest of the pie.
plt.pie(
    m,
    explode=exp,
    labels=name,
    autopct='%.2f%%',
    colors=sns.color_palette('hls', 5),
)
plt.axis('equal')
Out[38]:
(-1.1105146598005549,
1.1005006980857408,
-1.1100114113997077,
1.213478078652238)
In [6]:
import seaborn as sns
iris = sns.load_dataset('iris')
iris
# samples (rows): 150
# 4 features, 1 label
# 3 kinds of flower, 50 samples per class
# how many dimensions does the data have?
. . .
In [7]:
# One integer class label per row: fifty 0s, then fifty 1s, then fifty 2s.
# Assumes the rows are grouped by class with 50 rows each — TODO confirm.
target = [label for label in range(3) for _ in range(50)]
In [9]:
import matplotlib.pyplot as plt
# c (the class of each point) must be a sequence with one entry per sample, and
# the entries must be numbers so that cmap can map them to colours.
plt.scatter(
    data=iris,
    x='sepal_length',
    y='sepal_width',
    c=target,
    cmap='spring',
)
Out[9]:
<matplotlib.collections.PathCollection at 0x1b66980b5f8>
In [10]:
# Same scatter plot for the petal measurements, coloured by class label.
plt.scatter(
    data=iris,
    x='petal_length',
    y='petal_width',
    c=target,
    cmap='spring',
)
Out[10]:
<matplotlib.collections.PathCollection at 0x1b6698ad4a8>
In [11]:
# Scatter of petal width against petal length with a fitted regression line.
sns.regplot(data=iris, x='petal_length', y='petal_width')
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b6698adda0>
In [12]:
# Scatter of sepal width against sepal length with a fitted regression line.
sns.regplot(data=iris, x='sepal_length', y='sepal_width')
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b6698ed898>
In [14]:
# Scatter-plot matrix: one panel for each pair of columns.
# hue: colours the points by the given category column.
sns.pairplot(data=iris, hue='species')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x1b66998c7f0>
In [ ]: