import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据预处理'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('MotorcycleData.csv', encoding='gbk', na_values='Na')
def f(x):
if '$' in str(x):
x = str(x).strip('$')
x = str(x).replace(',', '')
else:
x = str(x).replace(',', '')
return float(x)
df['Price'] = df['Price'].apply(f)
df['Mileage'] = df['Mileage'].apply(f)
# 计算缺失比例
df.apply(lambda x: sum(x.isnull())/len(x), axis=0)
Condition 0.000000
Condition_Desc 0.778994
Price 0.000000
Location 0.000267
Model_Year 0.000534
Mileage 0.003470
Exterior_Color 0.095422
Make 0.000534
Warranty 0.318297
Model 0.016415
Sub_Model 0.676231
Type 0.197785
Vehicle_Title 0.964233
OBO 0.008808
Feedback_Perc 0.117710
Watch_Count 0.530629
N_Reviews 0.000801
Seller_Status 0.083411
Vehicle_Tile 0.007207
Auction 0.002269
Buy_Now 0.031630
Bid_Count 0.707727
dtype: float64
df.head(3)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Used</td>
<td>mint!!! very low miles</td>
<td>11412.0</td>
<td>McHenry, Illinois, United States</td>
<td>2013.0</td>
<td>16000.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Unspecified</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>8.1</td>
<td>NaN</td>
<td>2427</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>28.0</td>
</tr>
<tr>
<th>1</th>
<td>Used</td>
<td>Perfect condition</td>
<td>17200.0</td>
<td>Fort Recovery, Ohio, United States</td>
<td>2016.0</td>
<td>60.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Vehicle has an existing warranty</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>17</td>
<td>657</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>Used</td>
<td>NaN</td>
<td>3872.0</td>
<td>Chicago, Illinois, United States</td>
<td>1970.0</td>
<td>25763.0</td>
<td>Silver/Blue</td>
<td>BMW</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>R-Series</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>136</td>
<td>NaN</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>26.0</td>
</tr>
</tbody>
</table>
<p>3 rows × 22 columns</p>
</div>
# how = 'all', 只有当前行都是缺失值才删除
# how = 'any', 只要当前行有一个缺失值就删除
df.dropna(how = 'any', axis=0)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
<p>0 rows × 22 columns</p>
</div>
# subset 根据指定字段判断
# df.dropna(how='any', subset=['Condition', 'Price', 'Mileage'])
# 缺失值使用0填补
df.fillna(0).head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Used</td>
<td>mint!!! very low miles</td>
<td>11412.0</td>
<td>McHenry, Illinois, United States</td>
<td>2013.0</td>
<td>16000.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Unspecified</td>
<td>Touring</td>
<td>...</td>
<td>0</td>
<td>FALSE</td>
<td>8.1</td>
<td>0</td>
<td>2427</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>28.0</td>
</tr>
<tr>
<th>1</th>
<td>Used</td>
<td>Perfect condition</td>
<td>17200.0</td>
<td>Fort Recovery, Ohio, United States</td>
<td>2016.0</td>
<td>60.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Vehicle has an existing warranty</td>
<td>Touring</td>
<td>...</td>
<td>0</td>
<td>FALSE</td>
<td>100</td>
<td>17</td>
<td>657</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>Used</td>
<td>0</td>
<td>3872.0</td>
<td>Chicago, Illinois, United States</td>
<td>1970.0</td>
<td>25763.0</td>
<td>Silver/Blue</td>
<td>BMW</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>R-Series</td>
<td>...</td>
<td>0</td>
<td>FALSE</td>
<td>100</td>
<td>0</td>
<td>136</td>
<td>0</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>26.0</td>
</tr>
<tr>
<th>3</th>
<td>Used</td>
<td>CLEAN TITLE READY TO RIDE HOME</td>
<td>6575.0</td>
<td>Green Bay, Wisconsin, United States</td>
<td>2009.0</td>
<td>33142.0</td>
<td>Red</td>
<td>Harley-Davidson</td>
<td>0</td>
<td>Touring</td>
<td>...</td>
<td>0</td>
<td>FALSE</td>
<td>100</td>
<td>0</td>
<td>2920</td>
<td>Dealer</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>11.0</td>
</tr>
<tr>
<th>4</th>
<td>Used</td>
<td>0</td>
<td>10000.0</td>
<td>West Bend, Wisconsin, United States</td>
<td>2012.0</td>
<td>17800.0</td>
<td>Blue</td>
<td>Harley-Davidson</td>
<td>NO WARRANTY</td>
<td>Touring</td>
<td>...</td>
<td>0</td>
<td>FALSE</td>
<td>100</td>
<td>13</td>
<td>271</td>
<td>OWNER</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
</tbody>
</table>
<p>5 rows × 22 columns</p>
</div>
# 针对一个变量进行缺失值判断,使用其均值进行填补
df.Mileage.fillna(df.Mileage.mean()).head(5)
0 16000.0
1 60.0
2 25763.0
3 33142.0
4 17800.0
Name: Mileage, dtype: float64
df.columns
Index(['Condition', 'Condition_Desc', 'Price', 'Location', 'Model_Year',
'Mileage', 'Exterior_Color', 'Make', 'Warranty', 'Model', 'Sub_Model',
'Type', 'Vehicle_Title', 'OBO', 'Feedback_Perc', 'Watch_Count',
'N_Reviews', 'Seller_Status', 'Vehicle_Tile', 'Auction', 'Buy_Now',
'Bid_Count'],
dtype='object')
df[df['Exterior_Color'].isnull()].head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
<tr>
<th>14</th>
<td>Used</td>
<td>NaN</td>
<td>5500.0</td>
<td>Davenport, Iowa, United States</td>
<td>2008.0</td>
<td>22102.0</td>
<td>NaN</td>
<td>Harley-Davidson</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>9.3</td>
<td>NaN</td>
<td>244</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>16.0</td>
</tr>
<tr>
<th>35</th>
<td>Used</td>
<td>NaN</td>
<td>7700.0</td>
<td>Roselle, Illinois, United States</td>
<td>2007.0</td>
<td>10893.0</td>
<td>NaN</td>
<td>Harley-Davidson</td>
<td>NaN</td>
<td>Other</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>236</td>
<td>NaN</td>
<td>Clear</td>
<td>False</td>
<td>TRUE</td>
<td>NaN</td>
</tr>
<tr>
<th>41</th>
<td>Used</td>
<td>NaN</td>
<td>6800.0</td>
<td>Hampshire, Illinois, United States</td>
<td>2003.0</td>
<td>55782.0</td>
<td>NaN</td>
<td>Harley-Davidson</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>Softail</td>
<td>...</td>
<td>NaN</td>
<td>TRUE</td>
<td>100</td>
<td>2<</td>
<td>298</td>
<td>Private Seller</td>
<td>Clear</td>
<td>False</td>
<td>TRUE</td>
<td>NaN</td>
</tr>
<tr>
<th>55</th>
<td>Used</td>
<td>NaN</td>
<td>29500.0</td>
<td>Parma, Michigan, United States</td>
<td>1950.0</td>
<td>8471.0</td>
<td>NaN</td>
<td>Harley-Davidson</td>
<td>NaN</td>
<td>Other</td>
<td>...</td>
<td>NaN</td>
<td>TRUE</td>
<td>100</td>
<td>24</td>
<td>216</td>
<td>NaN</td>
<td>Clear</td>
<td>False</td>
<td>TRUE</td>
<td>NaN</td>
</tr>
<tr>
<th>72</th>
<td>Used</td>
<td>NaN</td>
<td>6500.0</td>
<td>Bourbonnais, Illinois, United States</td>
<td>1986.0</td>
<td>55300.0</td>
<td>NaN</td>
<td>Harley-Davidson</td>
<td>NaN</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>TRUE</td>
<td>100</td>
<td>2<</td>
<td>1</td>
<td>Private Seller</td>
<td>Clear</td>
<td>False</td>
<td>TRUE</td>
<td>NaN</td>
</tr>
</tbody>
</table>
<p>5 rows × 22 columns</p>
</div>
# 求众数
df['Exterior_Color'].mode()[0]
'Black'
# 缺失用众数填补
df['Exterior_Color'].fillna(df['Exterior_Color'].mode()[0]).head(5)
0 Black
1 Black
2 Silver/Blue
3 Red
4 Blue
Name: Exterior_Color, dtype: object
df['Mileage'].median()
7083.0
# 对不同变量使用不同数据填补
# 不加inplace=True,不会对原数据生效
df.fillna(value={'Exterior_Color': df['Exterior_Color'].mode()[0],
'Mileage': df['Mileage'].median(),}).head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Used</td>
<td>mint!!! very low miles</td>
<td>11412.0</td>
<td>McHenry, Illinois, United States</td>
<td>2013.0</td>
<td>16000.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Unspecified</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>8.1</td>
<td>NaN</td>
<td>2427</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>28.0</td>
</tr>
<tr>
<th>1</th>
<td>Used</td>
<td>Perfect condition</td>
<td>17200.0</td>
<td>Fort Recovery, Ohio, United States</td>
<td>2016.0</td>
<td>60.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Vehicle has an existing warranty</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>17</td>
<td>657</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>Used</td>
<td>NaN</td>
<td>3872.0</td>
<td>Chicago, Illinois, United States</td>
<td>1970.0</td>
<td>25763.0</td>
<td>Silver/Blue</td>
<td>BMW</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>R-Series</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>136</td>
<td>NaN</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>26.0</td>
</tr>
<tr>
<th>3</th>
<td>Used</td>
<td>CLEAN TITLE READY TO RIDE HOME</td>
<td>6575.0</td>
<td>Green Bay, Wisconsin, United States</td>
<td>2009.0</td>
<td>33142.0</td>
<td>Red</td>
<td>Harley-Davidson</td>
<td>NaN</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>2920</td>
<td>Dealer</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>11.0</td>
</tr>
<tr>
<th>4</th>
<td>Used</td>
<td>NaN</td>
<td>10000.0</td>
<td>West Bend, Wisconsin, United States</td>
<td>2012.0</td>
<td>17800.0</td>
<td>Blue</td>
<td>Harley-Davidson</td>
<td>NO WARRANTY</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>13</td>
<td>271</td>
<td>OWNER</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
</tbody>
</table>
<p>5 rows × 22 columns</p>
</div>
# 前向填补
df['Exterior_Color'].fillna(method='ffill').tail(10)
7483 Purple
7484 Purple
7485 Black
7486 Black
7487 Gray
7488 Black
7489 Black
7490 Red
7491 TWO TONE
7492 Gray
Name: Exterior_Color, dtype: object
# 后向填补
df['Exterior_Color'].fillna(method='bfill').tail(10)
7483 Purple
7484 Black
7485 Black
7486 Black
7487 Gray
7488 Black
7489 Black
7490 Red
7491 TWO TONE
7492 Gray
Name: Exterior_Color, dtype: object
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。