import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据统计'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('online_order.csv', encoding='gbk', dtype={'customer':str, 'order':str})
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>customer</th> <th>order</th> <th>total_items</th> <th>discount%</th> <th>weekday</th> <th>hour</th> <th>Food%</th> <th>Fresh%</th> <th>Drinks%</th> <th>Home%</th> <th>Beauty%</th> <th>Health%</th> <th>Baby%</th> <th>Pets%</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>0</td> <td>0</td> <td>45</td> <td>23.03</td> <td>4</td> <td>13</td> <td>9.46</td> <td>87.06</td> <td>3.48</td> <td>0.00</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>1</th> <td>0</td> <td>1</td> <td>38</td> <td>1.22</td> <td>5</td> <td>13</td> <td>15.87</td> <td>75.80</td> <td>6.22</td> <td>2.12</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>2</th> <td>0</td> <td>2</td> <td>51</td> <td>18.08</td> <td>4</td> <td>13</td> <td>16.88</td> <td>56.75</td> <td>3.37</td> <td>16.48</td> <td>6.53</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>3</th> <td>1</td> <td>3</td> <td>57</td> <td>16.51</td> <td>1</td> <td>12</td> <td>28.81</td> <td>35.99</td> <td>11.78</td> <td>4.62</td> <td>2.87</td> <td>15.92</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>4</th> <td>1</td> <td>4</td> <td>53</td> <td>18.31</td> <td>2</td> <td>11</td> <td>24.13</td> <td>60.38</td> <td>7.78</td> <td>7.72</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr></tbody>
</table>
</div>
grouped = df.groupby('weekday')
# 只可传入一个统计参数
# agg可传入多个
# grouped.apply([np.mean, np.sum])
grouped.apply(np.mean)[['total_items', 'discount%', 'weekday']]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>total_items</th> <th>discount%</th> <th>weekday</th></tr><tr> <th>weekday</th> <th></th> <th></th> <th></th></tr></thead>
<tbody>
<tr> <th>1</th> <td>30.662177</td> <td>8.580705</td> <td>1.0</td></tr><tr> <th>2</th> <td>31.868612</td> <td>8.638014</td> <td>2.0</td></tr><tr> <th>3</th> <td>31.869796</td> <td>7.794507</td> <td>3.0</td></tr><tr> <th>4</th> <td>32.251899</td> <td>8.068155</td> <td>4.0</td></tr><tr> <th>5</th> <td>31.406619</td> <td>9.159031</td> <td>5.0</td></tr><tr> <th>6</th> <td>32.154814</td> <td>8.414258</td> <td>6.0</td></tr><tr> <th>7</th> <td>32.373837</td> <td>8.710171</td> <td>7.0</td></tr></tbody>
</table>
</div>
df.columns
Index(['customer', 'order', 'total_items', 'discount%', 'weekday', 'hour', 'Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%', 'Pets%'], dtype='object')
var = ['Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%',
'Pets%']
df[var].head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>Food%</th> <th>Fresh%</th> <th>Drinks%</th> <th>Home%</th> <th>Beauty%</th> <th>Health%</th> <th>Baby%</th> <th>Pets%</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>9.46</td> <td>87.06</td> <td>3.48</td> <td>0.00</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>1</th> <td>15.87</td> <td>75.80</td> <td>6.22</td> <td>2.12</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>2</th> <td>16.88</td> <td>56.75</td> <td>3.37</td> <td>16.48</td> <td>6.53</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>3</th> <td>28.81</td> <td>35.99</td> <td>11.78</td> <td>4.62</td> <td>2.87</td> <td>15.92</td> <td>0.0</td> <td>0.0</td></tr><tr> <th>4</th> <td>24.13</td> <td>60.38</td> <td>7.78</td> <td>7.72</td> <td>0.00</td> <td>0.00</td> <td>0.0</td> <td>0.0</td></tr></tbody>
</table>
</div>
# 计算每个变量的总和
df[var].apply(np.sum, axis=0)
Food%      706812.19
Fresh%     606818.38
Drinks%    700477.06
Home%      406187.25
Beauty%    176788.48
Health%     33988.76
Baby%      332884.34
Pets%       31292.61
dtype: float64
# 对每一行求和
df[var].apply(np.sum, axis=1).head(5)
0    100.00
1    100.01
2    100.01
3     99.99
4    100.01
dtype: float64
# Food% - Fresh%
df[var].apply(lambda x: x[0] - x[1], axis=1).head(5)
0   -77.60
1   -59.93
2   -39.87
3    -7.18
4   -36.25
dtype: float64
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。