数据准备是机器学习的基础,俗话说巧妇难为无米之炊,没有数据的机器学习就是耍流氓。
接下来将使用公众号其他成员分享的内容现学现卖一篇,文章中使用了我们公众号成员推荐的Xarray库、wrf-python库,目的是从WRF模式输出提取出站点在不同高度/等压面数据。
def nearest_position( stn_lat, stn_lon, xlat, xlon ):
"""获取最临近格点坐标索引
parameters:
----------------------
lon : 单个站点经度
lat : 单个站点纬度
lons : numpy.ndarray网格二维经度坐标
lats : numpy.ndarray网格二维纬度坐标
Return:
-----------------------
(xindx,yindx)
"""
import numpy as np
difflat = stn_lat - xlat;
difflon = stn_lon - xlon;
rad = np.multiply(difflat,difflat)+np.multiply(difflon , difflon)#difflat * difflat + difflon * difflon
aa=np.where(rad==np.min(rad))
ind=np.squeeze(np.array(aa))
return tuple(ind)
上述函数是为了得到距离某个站点在wrfout输出的二维经纬度上找到最近的格点索引,比较简单粗暴。实际上也可以用scipy.spatial中的cKDTree来做。
import xarray
file_name_nc= 'wrfout_dxx_2009-04-21_06_00_00'
ds = xarray.open_dataset(file_name_nc)
XLAT_WRF2D = np.squeeze(ds.coords['XLAT'].values)
XLONG_WRF2D=np.squeeze(ds.coords['XLONG'].values)
ind=nearest_position( 133.001 , 18.1944,XLONG_WRF2D, XLAT_WRF2D)
print('距离该站点最近的格点经纬度索引为:',ind)
print('距离站点最近格点的经度为:',XLONG_WRF2D[ind[0],ind[1]])
print('距离站点最近格点的纬度为:',XLAT_WRF2D[ind[0],ind[1]])
距离该站点最近的格点经纬度索引为: (96, 93)
第一个站点的经度为: 133.04703
第一个站点的纬度为: 18.168343
注:站点经纬度坐标是随便写的,以脱敏,如有雷同,纯属巧合。
上面的函数只是针对单个站点。用for循环进行批量操作,可以得到类似如下pandas dataframe结果,命名为get_stn:
Station_ID LONG LAT Xidx Yidx
6666666 129.996 33.3586 93.0 72.0
7777777 143.001 34.1944 73.0 63.0
8888888 128.663 22.1430 86.0 57.0
.
.
.
.
有了批量的站点信息,下面编写函数进行wrfout站点信息提取.
第二步 编写 读取站点信息函数def read_wrfout_stns_multilevels(flnm_wrfout,get_stn,features_3D,features_2D,features_special,interp_levels,features_all,interp_type='ght_agl'):
'''
作者:MeteoAI
--------------------------------------------------------------------
参数:
flnm_wrfout:单个WRF输出文件名称
get_stn:Pandas dataframe 表格文件,列名为 站点ID ,站点经度, 站点纬度, 最近格点经度索引,最近格点纬度索引
features_3D :wrfout中三维变量,如u,v,T
features_2D : wrfout中的二维变量: 如降雨 、短波辐射
features_special: 可以是PBLH等二维变量
interp_levels: 插值层列表, 比如高度层列表[0.001,0.01,0.02,0.3]
interp_types :插值种类
插值方式暂时使用两种
高度层垂向插值 ght_agl: above groud level,单位Km
等压面 :pressure 单位hPa
详细信息可以help(wrf.vinterp),有更多的插值种类
'''
import os
import arrow
import numpy as np
import wrf
import netCDF4 as nc
import xarray as xr
import pandas as pd
from netCDF4 import Dataset
from wrf import getvar, vinterp,interplevel
ds = xr.open_dataset(flnm_wrfout)
data_nc=nc.Dataset(flnm_wrfout)
wrfin =data_nc
if interp_type == 'ght_agl':
units='Km'
name_dim0='height'
else:
units='hPa'
name_dim0='pressure'
########################################
va=getvar(wrfin,"va")
ua=getvar(wrfin,"ua")
##########################
rh=getvar(wrfin,"rh")#
RH_interp=wrf.vinterp(data_nc,np.squeeze(rh.values),interp_type,interp_levels,extrapolate=True)
UU_interp=wrf.vinterp(data_nc,np.squeeze(ua.values),interp_type,interp_levels,extrapolate=True)
VV_interp=wrf.vinterp(data_nc,np.squeeze(va.values),interp_type,interp_levels,extrapolate=True)
PR=ds['PB'] +ds['P']
pblh = getvar(wrfin, "PBLH")
tc=getvar(wrfin,"tc")# 摄氏度温度
T_interp=wrf.vinterp(data_nc,np.squeeze(tc.values),interp_type,interp_levels,extrapolate=True)
PR_interp=wrf.vinterp(data_nc,np.squeeze(PR.values),interp_type,interp_levels,extrapolate=True)/100#
#P_interp
myds=xr.Dataset() #
myds['UU'] = UU_interp
myds['VV'] = VV_interp
myds['TA'] =T_interp
myds['PR']=PR_interp#np.squeeze((ds['PB']+ds['P']))
myds['RH']=RH_interp
myds['RA']=ds['RAINC']+ds['RAINNC']# getvar(wrfin,"pw") # #
myds['SW']=ds['SWDOWN']
# pressure is just name , in fact it could alos stands for height in this code if interp_type = ght_agl.
myds=myds.rename({"dim_0": name_dim0 , "dim_1": "south_north","dim_2": "west_east"})#
myds.coords['Pressure'] = xr.DataArray(interp_levels, dims='pressure')
myds.coords['XLAT']=xr.DataArray(np.squeeze(ds.coords['XLAT']))
myds.coords['XLONG']=xr.DataArray(np.squeeze(ds.coords['XLONG']))
XLAT_WRF2D = myds.coords['XLAT']
XLAT_WRF2D = np.squeeze(XLAT_WRF2D.values)
XLONG_WRF2D=myds.coords['XLONG']
XLONG_WRF2D=np.squeeze(XLONG_WRF2D.values)
df_empty=pd.DataFrame(columns=features_all,index=get_stn.index)
interp_levels_index=np.arange(len(interp_levels))
for idx ,value in get_stn.iterrows(): #
yidx=int(value[1])
xidx=int(value[0])
#lon_value, lat_value ,xidx ,yidx=nearest_point(lon, lat, XLONG_WRF2D, XLAT_WRF2D)
for each_var in features_3D:
for each_level_index in interp_levels_index:
#print(interp_levels[each_level_index])
if interp_type == 'ght_agl':
v=myds[each_var][dict(height=each_level_index,south_north=xidx,west_east=yidx)]#myds['UU'][dict(pressure=1,south_north=6,west_east=7)]#
feature_name=each_var+'_'+str(interp_levels[each_level_index])+units
df_empty.loc[idx][feature_name]=v.values
else:
v=myds[each_var][dict(pressure=each_level_index,south_north=xidx,west_east=yidx)]#myds['UU'][dict(pressure=1,south_north=6,west_east=7)]#
feature_name=each_var+'_'+str(interp_levels[each_level_index])+units
df_empty.loc[idx][feature_name]=v.values
for each_var in features_2D:
v=myds[each_var][dict(south_north=xidx,west_east=yidx)] #myds['UU'][dict(pressure=1,south_north=6,west_east=7)]#
feature_name=each_var
df_empty.loc[idx][each_var]=v.values[0]
if 'PBLH' in features_special:
pblh_v=np.squeeze(pblh[dict(south_north=xidx,west_east=yidx)].values)#JUST A number
df_empty.loc[idx]['PBLH']=pblh_v
return df_empty
测试interp_levels = [0,0.002,0.004,0.008,0.01,0.015,0.02,0.25,0.50,0.75,1.50,3.00,5.00]
#########################################################################
features_special=['PBLH']
features_2D=['RA','SW']
features_3D=['UU','VV','TA','PR','RH']#['UU']
features_km_list=[]
for each in features_3D:
for km in (interp_levels):
features_km_list.append(each+'_'+str(km)+'Km')
features_all=[]
features_all.extend(features_km_list)
features_all.extend(features_special)
features_all.extend(features_2D)
#print(features_all)
stns_info=read_wrfout_stns_multilevels(flnm_wrfout,
get_stn,
features_3D,
features_2D,
features_special,
interp_levels,
features_all,
interp_type='ght_agl')
stns_info.head(3)