import pandas as pd
import os
import numpy as np
os.getcwd()
# NOTE(review): the next line looks like the notebook's echoed result of the
# os.getcwd() call above rather than an intended statement — confirm.
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据'
# Switch into the data directory so the CSV below can be opened by bare name.
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
# Load the trade history; user_id is read as str instead of a numeric dtype.
df = pd.read_csv(
    'baby_trade_history.csv',
    encoding='utf-8',
    dtype={'user_id': str},
)
# Derive a label column: '高' when buy_mount is greater than 3, otherwise '低'.
df['购买量'] = np.where(df['buy_mount'].gt(3), '高', '低')
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>购买量</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>低</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>21896936223</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>12515996043</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>低</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>20487688075</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>低</td>
</tr>
</tbody>
</table>
</div>
# Move the second column to the first position.
# Step 1: take the column out of df; pop() returns it and removes it from df.
auction_id = df.pop('auction_id')
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>购买量</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>低</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>低</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>低</td>
</tr>
</tbody>
</table>
</div>
# DataFrame.insert arguments:
#   loc    - position at which the new column is inserted
#   column - label of the new column
#   value  - data for the new column
df.insert(loc=0, column='auction_id_new', value=auction_id)
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>auction_id_new</th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>购买量</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>41098319944</td>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>低</td>
</tr>
<tr>
<th>1</th>
<td>17916191097</td>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>2</th>
<td>21896936223</td>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>3</th>
<td>12515996043</td>
<td>917056007</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>低</td>
</tr>
<tr>
<th>4</th>
<td>20487688075</td>
<td>444069173</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>低</td>
</tr>
</tbody>
</table>
</div>
# Drop two columns. Without inplace=True this returns a new frame and
# leaves df itself unchanged.
df.drop(columns=['auction_id_new', '购买量']).head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
</tr>
</tbody>
</table>
</div>
# Look at df again: the columns dropped a moment ago are still present,
# because that drop was not applied to the original data.
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>auction_id_new</th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>购买量</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>41098319944</td>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>低</td>
</tr>
<tr>
<th>1</th>
<td>17916191097</td>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>2</th>
<td>21896936223</td>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>低</td>
</tr>
<tr>
<th>3</th>
<td>12515996043</td>
<td>917056007</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>低</td>
</tr>
<tr>
<th>4</th>
<td>20487688075</td>
<td>444069173</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>低</td>
</tr>
</tbody>
</table>
</div>
# Drop the two columns again, this time with inplace=True so that df itself
# is modified.
# NOTE(review): the KeyError traceback recorded below looks like the result of
# running this cell a second time, after both columns had already been removed
# (the df.head(5) output that follows no longer contains them) — confirm.
# Passing errors='ignore' would make the cell safe to re-run.
df.drop(labels=['auction_id_new', '购买量'], axis=1, inplace=True)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-30-acf2a75acaf3> in <module>
----> 1 df.drop(labels=['auction_id_new', '购买量'], axis=1, inplace=True)
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3938 index=index, columns=columns,
3939 level=level, inplace=inplace,
-> 3940 errors=errors)
3941
3942 @rewrite_axis_style_signature('mapper', [('copy', True),
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3778 for axis, labels in axes.items():
3779 if labels is not None:
-> 3780 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3781
3782 if inplace:
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in _drop_axis(self, labels, axis, level, errors)
3810 new_axis = axis.drop(labels, level=level, errors=errors)
3811 else:
-> 3812 new_axis = axis.drop(labels, errors=errors)
3813 result = self.reindex(**{axis_name: new_axis})
3814
D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in drop(self, labels, errors)
4963 if errors != 'ignore':
4964 raise KeyError(
-> 4965 '{} not found in axis'.format(labels[mask]))
4966 indexer = indexer[~mask]
4967 return self.delete(indexer)
KeyError: "['auction_id_new' '购买量'] not found in axis"
# Check the result of the in-place drop on df.
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
</tr>
</tbody>
</table>
</div>
# Drop the rows whose index labels are 3 and 4, modifying df in place.
df.drop(index=[3, 4], inplace=True)
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>5</th>
<td>152298847</td>
<td>121394024</td>
<td>50008168</td>
<td>21458:3408353;13023209:727117752;22009:2741771...</td>
<td>1</td>
<td>20141103</td>
</tr>
<tr>
<th>6</th>
<td>513441334</td>
<td>50010557</td>
<td>50008168</td>
<td>25935:21991;1628665:29784;22019:34731;22019:20...</td>
<td>1</td>
<td>20121212</td>
</tr>
</tbody>
</table>
</div>
# Drop the rows labelled 0, 1 and 2 in place (range stops before 3).
df.drop(index=range(3), inplace=True)
df.head(n=5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>5</th>
<td>152298847</td>
<td>121394024</td>
<td>50008168</td>
<td>21458:3408353;13023209:727117752;22009:2741771...</td>
<td>1</td>
<td>20141103</td>
</tr>
<tr>
<th>6</th>
<td>513441334</td>
<td>50010557</td>
<td>50008168</td>
<td>25935:21991;1628665:29784;22019:34731;22019:20...</td>
<td>1</td>
<td>20121212</td>
</tr>
<tr>
<th>7</th>
<td>297411659</td>
<td>50010542</td>
<td>50008168</td>
<td>21458:60020529;25935:31381;1633959:27247291;16...</td>
<td>1</td>
<td>20121212</td>
</tr>
<tr>
<th>8</th>
<td>82830661</td>
<td>50013874</td>
<td>28</td>
<td>21458:11580;21475:137325</td>
<td>1</td>
<td>20121101</td>
</tr>
<tr>
<th>9</th>
<td>475046636</td>
<td>203527</td>
<td>28</td>
<td>22724:40168;22729:40278;21458:21817;2770200:24...</td>
<td>1</td>
<td>20121101</td>
</tr>
</tbody>
</table>
</div>
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。