import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据预处理'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('MotorcycleData.csv', encoding='gbk', na_values='Na')
def f(x):
if '$' in str(x):
x = str(x).strip('$')
x = str(x).replace(',', '')
else:
x = str(x).replace(',', '')
return float(x)
df['Price'] = df['Price'].apply(f)
df['Mileage'] = df['Mileage'].apply(f)
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Condition</th>
<th>Condition_Desc</th>
<th>Price</th>
<th>Location</th>
<th>Model_Year</th>
<th>Mileage</th>
<th>Exterior_Color</th>
<th>Make</th>
<th>Warranty</th>
<th>Model</th>
<th>...</th>
<th>Vehicle_Title</th>
<th>OBO</th>
<th>Feedback_Perc</th>
<th>Watch_Count</th>
<th>N_Reviews</th>
<th>Seller_Status</th>
<th>Vehicle_Tile</th>
<th>Auction</th>
<th>Buy_Now</th>
<th>Bid_Count</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Used</td>
<td>mint!!! very low miles</td>
<td>11412.0</td>
<td>McHenry, Illinois, United States</td>
<td>2013.0</td>
<td>16000.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Unspecified</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>8.1</td>
<td>NaN</td>
<td>2427</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>28.0</td>
</tr>
<tr>
<th>1</th>
<td>Used</td>
<td>Perfect condition</td>
<td>17200.0</td>
<td>Fort Recovery, Ohio, United States</td>
<td>2016.0</td>
<td>60.0</td>
<td>Black</td>
<td>Harley-Davidson</td>
<td>Vehicle has an existing warranty</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>17</td>
<td>657</td>
<td>Private Seller</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>Used</td>
<td>NaN</td>
<td>3872.0</td>
<td>Chicago, Illinois, United States</td>
<td>1970.0</td>
<td>25763.0</td>
<td>Silver/Blue</td>
<td>BMW</td>
<td>Vehicle does NOT have an existing warranty</td>
<td>R-Series</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>136</td>
<td>NaN</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>26.0</td>
</tr>
<tr>
<th>3</th>
<td>Used</td>
<td>CLEAN TITLE READY TO RIDE HOME</td>
<td>6575.0</td>
<td>Green Bay, Wisconsin, United States</td>
<td>2009.0</td>
<td>33142.0</td>
<td>Red</td>
<td>Harley-Davidson</td>
<td>NaN</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>NaN</td>
<td>2920</td>
<td>Dealer</td>
<td>Clear</td>
<td>True</td>
<td>FALSE</td>
<td>11.0</td>
</tr>
<tr>
<th>4</th>
<td>Used</td>
<td>NaN</td>
<td>10000.0</td>
<td>West Bend, Wisconsin, United States</td>
<td>2012.0</td>
<td>17800.0</td>
<td>Blue</td>
<td>Harley-Davidson</td>
<td>NO WARRANTY</td>
<td>Touring</td>
<td>...</td>
<td>NaN</td>
<td>FALSE</td>
<td>100</td>
<td>13</td>
<td>271</td>
<td>OWNER</td>
<td>Clear</td>
<td>True</td>
<td>TRUE</td>
<td>0.0</td>
</tr>
</tbody>
</table>
<p>5 rows × 22 columns</p>
</div>
# 对价格异常值处理
# 计算价格均值
x_bar = df['Price'].mean()
# 计算价格标准差
x_std = df['Price'].std()
# 异常值上限检测
any(df['Price'] > x_bar + 2 * x_std)
True
# 异常值下限检测
any(df['Price'] < x_bar - 2 * x_std)
False
# 描述性统计
df['Price'].describe()
count 7493.000000
mean 9968.811557
std 8497.326850
min 0.000000
25% 4158.000000
50% 7995.000000
75% 13000.000000
max 100000.000000
Name: Price, dtype: float64
# 25% 分位数
Q1 = df['Price'].quantile(q = 0.25)
# 75% 分位数
Q3 = df['Price'].quantile(q = 0.75)
# 分位差
IQR = Q3 - Q1
any(df['Price'] > Q3 + 1.5 * IQR)
True
any(df['Price'] < Q1 - 1.5 * IQR)
False
import matplotlib.pyplot as plt
%matplotlib inline
df['Price'].plot(kind='box')
<matplotlib.axes._subplots.AxesSubplot at 0x11ddad20ac8>
# 设置绘图风格
plt.style.use('seaborn')
# 绘制直方图
df.Price.plot(kind='hist', bins=30, density=True)
# 绘制核密度图
df.Price.plot(kind='kde')
# 图形展现
plt.show()
# 用99分位数和1分位数替换
# 计算P1和P99
P99 = df['Price'].quantile(q=0.99)
P1 = df['Price'].quantile(q=0.01)
P99
39995.32
df['Price_new'] = df['Price']
# 盖帽法
df.loc[df['Price'] > P99, 'Price_new'] = P99
df.loc[df['Price'] < P1, 'Price_new'] = P1
df[['Price', 'Price_new']].describe()
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Price</th>
<th>Price_new</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>7493.000000</td>
<td>7493.000000</td>
</tr>
<tr>
<th>mean</th>
<td>9968.811557</td>
<td>9821.220873</td>
</tr>
<tr>
<th>std</th>
<td>8497.326850</td>
<td>7737.092537</td>
</tr>
<tr>
<th>min</th>
<td>0.000000</td>
<td>100.000000</td>
</tr>
<tr>
<th>25%</th>
<td>4158.000000</td>
<td>4158.000000</td>
</tr>
<tr>
<th>50%</th>
<td>7995.000000</td>
<td>7995.000000</td>
</tr>
<tr>
<th>75%</th>
<td>13000.000000</td>
<td>13000.000000</td>
</tr>
<tr>
<th>max</th>
<td>100000.000000</td>
<td>39995.320000</td>
</tr>
</tbody>
</table>
</div>
# df['Price_new'].plot(kind='box')
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。