I am working with a large dataset spanning more than 50 years. Each year has about 10 million rows of records with multiple variables/columns. I need to perform groupby operations by location and time. My code runs extremely slowly: processing one year of data takes 2 to 5 hours, depending on how many stations there are in that year. I have read a few articles on multiprocessing, but since I have no experience with it, I am not sure whether that approach applies to my problem. I would appreciate it if someone could point out how to make my code more efficient.
#!/usr/bin/env python
# encoding: utf-8
import numpy as np
import pandas as pd
import datetime
import argparse
from scipy.stats.mstats import hmean
def Nstat(df):
    duMW = [6,7,8,9,30,31,32,33,34,35,98]
    d = {}
    d['NMW'] = df['MW'].count()
    d['NPW'] = df['PW'].count()
    d['NDU'] = df.loc[ df['RH']<=90,'MW'].isin(duMW).sum()
    d['NDU6'] = (df.loc[ df['RH']<=90,'MW']==6 ).sum()
    d['NDU7'] = (df.loc[ df['RH']<=90,'MW']==7 ).sum()
    d['NDU8'] = (df.loc[ df['RH']<=90,'MW']==8 ).sum()
    d['NDU9'] = (df.loc[ df['RH']<=90,'MW']==9 ).sum()
    d['NDU30'] = (df.loc[ df['RH']<=90,'MW']==30).sum()
    d['NDU31'] = (df.loc[ df['RH']<=90,'MW']==31).sum()
    d['NDU32'] = (df.loc[ df['RH']<=90,'MW']==32).sum()
    d['NDU33'] = (df.loc[ df['RH']<=90,'MW']==33).sum()
    d['NDU34'] = (df.loc[ df['RH']<=90,'MW']==34).sum()
    d['NDU35'] = (df.loc[ df['RH']<=90,'MW']==35).sum()
    d['NDU98'] = (df.loc[ df['RH']<=90,'MW']==98).sum()
    d['NDUpw'] = (df.loc[ df['RH']<=90,'PW']==3).sum()
    d['VIS_Hvg'] = hmean(df.loc[df['VIS']>0,'VIS'])
    d['Vi_Avg'] = df['Vi'].mean()
    return pd.Series(d,index=['NMW','NPW',\
        'NDU','NDU6','NDU7','NDU8','NDU9','NDU30','NDU31','NDU32',\
        'NDU33','NDU34','NDU35','NDU98','NDUpw','VIS_Hvg','Vi_Avg'])
if __name__ =='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("start_year",type=int,help='4-digit start year')
    parser.add_argument("end_year",type=int,help='4-digit end year')
    args = parser.parse_args()
    years = np.arange(args.start_year,args.end_year)
    dTypes = {
        'NMW':'Int32',\
        'NPW':'Int32',\
        'NDU':'Int32',\
        'NDU6':'Int32',\
        'NDU7':'Int32',\
        'NDU8':'Int32',\
        'NDU9':'Int32',\
        'NDU30':'Int32',\
        'NDU31':'Int32',\
        'NDU32':'Int32',\
        'NDU33':'Int32',\
        'NDU34':'Int32',\
        'NDU35':'Int32',\
        'NDU98':'Int32',\
        'NDUpw':'Int32'\
    }
    for iyr,yr in enumerate(years):
        print('process year {:d} at {:s}'.format(yr,datetime.datetime.now().strftime('%m-%d %H:%M:%S')))
        isd = pd.read_hdf('isd_lite_'+str(yr)+'.h5',dtype={'STATION':'str'})
        isd['YYYYMM'] = pd.to_datetime(isd['YYYYMMDDHH'],format='%Y%m%d%H').dt.strftime('%Y%m')
        isd['VIS'] = isd['VIS']/1000.
        isd['Vi'] = isd['VIS'].apply(lambda x: 1/x if x>0 else np.nan)
        print('>> groupby and output at {:s}'.format(datetime.datetime.now().strftime('%m-%d %H:%M:%S')))
        stn_month = isd.groupby(['STATION','YYYYMM']).apply(Nstat).reset_index().astype(dTypes)
        stn_month.to_csv('stn_month_'+str(yr)+'.csv',index=False,float_format='%.3f')

The last groupby (by STATION and YYYYMM) operation takes the longest. I have a fairly good workstation (256 cores) and would like to make the most of it.
A sample file is provided here. Processing that file takes 7 minutes, which is not terribly long because it only contains a handful of stations.
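Since each year is written to its own CSV, one way to use more of those cores would be to process several years in parallel rather than sequentially. Below is a minimal sketch (not from the original post) using the standard library's multiprocessing.Pool. It assumes Nstat and dTypes from the script above are defined at module level so worker processes can see them; the worker count and year range are placeholders to be tuned.

import multiprocessing as mp

import numpy as np
import pandas as pd

# Nstat and dTypes are assumed to come from the script above, moved to module level.

def process_year(yr):
    """Read one year, aggregate by station and month, write the CSV (mirrors the loop body above)."""
    isd = pd.read_hdf('isd_lite_'+str(yr)+'.h5')
    isd['YYYYMM'] = pd.to_datetime(isd['YYYYMMDDHH'],format='%Y%m%d%H').dt.strftime('%Y%m')
    isd['VIS'] = isd['VIS']/1000.
    isd['Vi'] = 1.0/isd['VIS'].where(isd['VIS']>0)    # NaN wherever VIS <= 0
    stn_month = isd.groupby(['STATION','YYYYMM']).apply(Nstat).reset_index().astype(dTypes)
    stn_month.to_csv('stn_month_'+str(yr)+'.csv',index=False,float_format='%.3f')
    return yr

if __name__ == '__main__':
    years = range(1970, 2020)                 # placeholder; the script builds this from argparse
    with mp.Pool(processes=8) as pool:        # hypothetical worker count, bounded by RAM per worker
        for done in pool.imap_unordered(process_year, years):
            print('finished year', done)

Each worker handles one year end-to-end, so nothing has to be shared between processes; the trade-off is that every worker must hold a full year (~10 million rows) in memory, which caps the useful number of workers well below the core count.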
Update (posted 2020-04-26 05:42:38):
It turns out my script had a bug, which is why the runtime was so ridiculously long. After fixing the bug the runtime shrank, but the code itself was still inefficient. The real problem is in Nstat: the row-based calculation is inefficient in terms of both CPU and memory. For those interested, read this article.
Thanks to @Juho, I dropped Nstat and switched to agg. The runtime was cut by more than half!
# pre-screening by RH > 90%
isd.loc[ isd.RH>90, 'MW'] = 0
isd.loc[ isd.RH>90, 'PW'] = 0
duMW = [6,7,8,9,30,31,32,33,34,35,98]   # same code list as in Nstat, now needed in this scope
stn_month = isd.groupby(['STATION',isd.DATE.dt.to_period('M')]).agg(
    NMW=('MW','count'),\
    NPW=('PW','count'),\
    NDU=('MW',lambda x: x.isin(duMW).sum()),\
    NDU6=('MW',lambda x: x.eq(6).sum()),\
    NDU7=('MW',lambda x: x.eq(7).sum()),\
    NDU8=('MW',lambda x: x.eq(8).sum()),\
    NDU9=('MW',lambda x: x.eq(9).sum()),\
    NDU30=('MW',lambda x: x.eq(30).sum()),\
    NDU31=('MW',lambda x: x.eq(31).sum()),\
    NDU32=('MW',lambda x: x.eq(32).sum()),\
    NDU33=('MW',lambda x: x.eq(33).sum()),\
    NDU34=('MW',lambda x: x.eq(34).sum()),\
    NDU35=('MW',lambda x: x.eq(35).sum()),\
    NDU98=('MW',lambda x: x.eq(98).sum()),\
    NDUpw=('PW',lambda x: x.eq(3).sum()),\
    VIS=('VIS',lambda x: hmean(x[x>0])),\
    Vi=('Vi','mean'),\
    DUP=('DUP','mean')\
).reset_index().astype(dTypes)
stn_month.to_csv('../stat/yearly/stn_all/stn_month_{:d}.csv'.format(yr),index=False,float_format='%.3f')

Source: https://codereview.stackexchange.com/questions/240888
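The agg version above still calls a Python lambda once per group for every NDU column. A further variation, not from the original post, is to build the 0/1 indicator columns once with vectorized comparisons and then let groupby reduce them with the built-in 'sum' and 'mean'. This is only a sketch: it assumes the MW/PW/RH/Vi columns and the YYYYMM key from the first script, and names such as dry and agg_spec are just for illustration.

duMW = [6,7,8,9,30,31,32,33,34,35,98]

dry = isd['RH'] <= 90                                  # pre-screen by relative humidity once
isd['NDU'] = (dry & isd['MW'].isin(duMW)).astype('int32')
for code in duMW:
    isd['NDU{:d}'.format(code)] = (dry & isd['MW'].eq(code)).astype('int32')
isd['NDUpw'] = (dry & isd['PW'].eq(3)).astype('int32')

agg_spec = {'MW':'count', 'PW':'count', 'Vi':'mean'}
agg_spec.update({c:'sum' for c in isd.columns if c.startswith('NDU')})

stn_month = (isd.groupby(['STATION','YYYYMM'])
                .agg(agg_spec)
                .rename(columns={'MW':'NMW','PW':'NPW','Vi':'Vi_Avg'})
                .reset_index())
# The harmonic mean of VIS over VIS > 0 equals 1/mean(1/VIS); since Vi is already
# 1/VIS (NaN where VIS <= 0) and mean() skips NaN, it can be recovered afterwards:
stn_month['VIS_Hvg'] = 1.0/stn_month['Vi_Avg']

With this layout all of the reductions go through pandas' built-in groupby paths, so the per-group cost is no longer dominated by Python-level function calls.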