昨日有读者说想看看EOF和小波分析,近期也在搞xarray的推文,就拿xarray的数据直接做了。
小波分析的话其他博主已经写过pycwt库的实现了,就直接转载了(偷懒)
本想写几种eof库比较的,在pyeof和xeof安装都失败后还是简单写一下eofs
In [54]:
from eofs.standard import Eof
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def get_time_space(df, time_dim, lumped_space_dims):
"""
Get a DataFrame with the dim: [row (e.g.,time), column (e.g, space)]
Parameters
----------
df : pandas.core.frame.DataFrame
| time | lat | lon | PM2.5 |
| 2011 | 66 | 88 | 22 |
| 2011 | 66 | 99 | 23 |
| 2012 | 66 | 88 | 24 |
| 2012 | 66 | 99 | 25 |
time_dim:
time dim name (str), e.g., "time"
lumped_space_dims:
a list of the lumped space, e.g., ["lat","lon"]
Returns
-------
dataframe:
with [row (e.g.,time), column (e.g, space)], e.g.,
| time | loc1 | loc2 |
| | (66,88) | (66,99) |
| 2011 | 22 | 24 |
| 2012 | 23 | 25 |
"""
return df.set_index([time_dim]+lumped_space_dims).unstack(lumped_space_dims)
In [80]:
import xarray as xr
# 数据
f = '/home/mw/input/cru3546/cru_ts4.07.2021.2022.pre.dat.nc'
# 打开数据集
ds = xr.open_dataset(f)
ds = ds.sel(lat=slice(20, 60), lon=slice(80, 130))
ds
Out[80]:
In [81]:
df = ds.to_dataframe().reset_index() # get df from da
df=df.drop(columns=['stn'])
df
Out[81]:
lon | lat | time | pre | |
---|---|---|---|---|
0 | 80.25 | 20.25 | 2021-01-16 | 1.800000 |
1 | 80.25 | 20.25 | 2021-02-15 | 3.400000 |
2 | 80.25 | 20.25 | 2021-03-16 | 31.100000 |
3 | 80.25 | 20.25 | 2021-04-16 | 14.000000 |
4 | 80.25 | 20.25 | 2021-05-16 | 22.800001 |
... | ... | ... | ... | ... |
191995 | 129.75 | 59.75 | 2022-08-16 | 86.500000 |
191996 | 129.75 | 59.75 | 2022-09-16 | 61.400002 |
191997 | 129.75 | 59.75 | 2022-10-16 | 34.799999 |
191998 | 129.75 | 59.75 | 2022-11-16 | 26.900000 |
191999 | 129.75 | 59.75 | 2022-12-16 | 17.700001 |
192000 rows × 4 columns
In [82]:
df_data = get_time_space(df, time_dim = "time", lumped_space_dims = ["lat","lon"])
display(df_data.head(5))
print("DataFrame Shape:",df_data.shape)
pre | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
lat | 20.25 | 20.75 | 21.25 | 21.75 | 22.25 | 22.75 | 23.25 | 23.75 | 24.25 | 24.75 | ... | 55.25 | 55.75 | 56.25 | 56.75 | 57.25 | 57.75 | 58.25 | 58.75 | 59.25 | 59.75 |
lon | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | 80.25 | ... | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 | 129.75 |
time | |||||||||||||||||||||
2021-01-16 | 1.800000 | 1.800000 | 2.400000 | 3.700000 | 5.400000 | 6.3 | 5.100000 | 4.700000 | 5.300000 | 5.900000 | ... | 8.900001 | 9.100000 | 10.000000 | 10.000000 | 10.200000 | 10.600000 | 11.300000 | 11.600000 | 10.700000 | 9.800000 |
2021-02-15 | 3.400000 | 3.500000 | 3.700000 | 4.700000 | 6.700000 | 7.8 | 5.700000 | 4.200000 | 4.400000 | 4.600000 | ... | 14.200000 | 15.900001 | 16.100000 | 15.500000 | 14.500000 | 13.800000 | 13.500000 | 13.700000 | 14.000000 | 13.700000 |
2021-03-16 | 31.100000 | 35.100002 | 41.600002 | 53.400002 | 75.900002 | 78.0 | 69.700005 | 62.100002 | 63.700001 | 60.100002 | ... | 27.900000 | 31.100000 | 28.700001 | 26.700001 | 22.900000 | 20.400000 | 18.300001 | 16.400000 | 16.200001 | 15.400001 |
2021-04-16 | 14.000000 | 9.900001 | 7.100000 | 5.600000 | 6.600000 | 6.6 | 4.600000 | 3.400000 | 3.800000 | 4.000000 | ... | 59.000000 | 62.400002 | 57.299999 | 56.299999 | 50.600002 | 45.400002 | 39.400002 | 27.600000 | 25.600000 | 25.300001 |
2021-05-16 | 22.800001 | 20.300001 | 18.400000 | 22.800001 | 27.600000 | 25.4 | 19.900000 | 13.600000 | 9.500000 | 6.600000 | ... | 117.900002 | 126.099998 | 109.200005 | 92.800003 | 72.200005 | 58.000000 | 48.100002 | 34.799999 | 28.700001 | 23.300001 |
5 rows × 8000 columns
DataFrame Shape: (24, 8000)
In [86]:
import matplotlib.pyplot as plt
def visualization(da, pcs, eofs_da, evf):
n = len(pcs[0]) # 获取模态数
fig, axs = plt.subplots(n+1, 2, figsize=(20, 6*(n+1)))
da.mean(dim=["lat","lon"]).plot(ax=axs[0, 0])
axs[0, 0].set_title("average pre")
da.mean(dim="time").plot(ax=axs[0, 1])
axs[0, 1].set_title("average pre")
for i in range(1, n+1):
pc_i = xr.DataArray(pcs[:,i-1],dims=["time"])
pc_i = pc_i.assign_coords(time=ds.time)
eof_i = eofs_da.sel(mode=i-1)
frac = str((evf[i-1] * 100).round(2))
axs[i, 0].plot(pc_i)
axs[i, 0].set_title("PC"+str(i)+" ("+frac+"%)")
img = eof_i.plot(x='lon',y='lat',ax=axs[i, 1], vmin=-0.75, vmax=0.75, cmap="RdBu_r", add_colorbar=False)
axs[i, 1].set_title("EOF"+str(i)+" ("+frac+"%)")
fig.colorbar(img, ax=axs[i, 1])
plt.tight_layout()
plt.show()
In [87]:
from eofs.standard import Eof
from sklearn.preprocessing import StandardScaler
n=4
# 使用StandardScaler对数据进行标准化处理
scaler = StandardScaler()
df_data_scaled = scaler.fit_transform(df_data.values)
# 创建EOFs对象
solver = Eof(df_data_scaled)
# 计算主分量分数(PCs)
s_pcs = solver.pcs(npcs=4, pcscaling=2)
# 计算模态函数(EOFs)
s_eofs =solver.eofs(neofs=4, eofscaling=2)
s_eofs = s_eofs.reshape(4,100,80)
# 创建xarray数组
lon = ds.lon
lat = ds.lat
mode = np.arange(0,n)
s_eofs_ds = xr.DataArray(s_eofs,coords=[mode,lon, lat], dims=["mode", "lon", "lat"])
# 计算解释方差比例
s_evfs = solver.varianceFraction(neigs=4)
visualization(ds['pre'], s_pcs, s_eofs_ds, s_evfs)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/extmath.py:770: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
/opt/conda/lib/python3.7/site-packages/sklearn/utils/extmath.py:709: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs, dtype=np.float64)
In [91]:
coslat = np.array(np.sqrt(np.cos(np.deg2rad(ds.lat)))).reshape((1,80))
# 创建EOFs对象
solver1 = Eof(df_data_scaled.reshape(24,100,80),weights = coslat)
# 计算主分量分数(PCs)
s_pcs1 = solver1.pcs(npcs=4, pcscaling=2)
# 计算模态函数(EOFs)
s_eofs1 =solver.eofs(neofs=4, eofscaling=2)
s_eofs1 = s_eofs.reshape(4,100,80)
# 创建xarray数组
lon = ds.lon
lat = ds.lat
mode = np.arange(0,n)
s_eofs_ds1 = xr.DataArray(s_eofs1,coords=[mode,lon, lat], dims=["mode", "lon", "lat"])
# 计算解释方差比例
s_evfs1 = solver.varianceFraction(neigs=4)
visualization(ds['pre'], s_pcs1, s_eofs_ds1, s_evfs1)
权重和不加权重看起来差异不大(甚至可以说没有),也许是数据量不够大,可自行测试别的长时间序列数据