删除数据集中的重复项是数据清洗过程中的常见操作。重复数据可能由多种原因造成，例如数据录入错误、系统集成问题或 ETL 流程中的缺陷。删除重复项可以提高数据质量，减少存储空间占用，并提高后续分析的准确性。下面依次给出 pandas、SQL 和 Java 的示例，每个示例都会在去重的同时记录被删除的行。
import pandas as pd
# Sample dataset: rows 2 and 4 repeat rows 0 and 1.
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    Age=[25, 30, 25, 35, 30],
    City=['NY', 'LA', 'NY', 'Chicago', 'LA'],
)
df = pd.DataFrame(data)

# Boolean mask that is True for every row repeating an earlier row.
# duplicated() defaults to keep='first', so the first occurrence is not marked.
duplicate_mask = df.duplicated()

# Rows that get dropped (the repeats), kept for auditing.
deleted_rows = df.loc[duplicate_mask]

# Rows that survive: everything the mask did not flag.
clean_df = df.loc[~duplicate_mask]

# Print the original frame, the dropped rows and the cleaned frame in turn.
for heading, frame in (
    ("原始数据集:", df),
    ("\n被删除的行:", deleted_rows),
    ("\n清理后的数据集:", clean_df),
):
    print(heading)
    print(frame)
-- Create an audit table holding exactly the rows that the de-duplication
-- DELETE removes. A row is captured only when another row with the same
-- column values and a smaller id exists, so the surviving row (smallest id
-- in each group) is left out of the audit table.
-- Run this before the DELETE, while the duplicates are still present.
CREATE TABLE deleted_duplicates AS
SELECT t1.*
FROM your_table t1
WHERE EXISTS (
    SELECT 1
    FROM your_table t2
    WHERE t2.id < t1.id
      AND t2.column1 = t1.column1
      AND t2.column2 = t1.column2
      -- add further comparison columns here
);
-- Delete duplicate rows, keeping the row with the smallest id in each
-- (column1, column2) group. The derived table lists only groups that
-- contain more than one row, together with the id to keep.
DELETE dup
FROM your_table AS dup
INNER JOIN (
    SELECT column1, column2, MIN(id) AS min_id
    FROM your_table
    GROUP BY column1, column2
    HAVING COUNT(*) > 1
) AS grp
    ON  dup.column1 = grp.column1
    AND dup.column2 = grp.column2
    AND dup.id <> grp.min_id;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Demonstrates removing duplicate records from a list while keeping a record
 * of the entries that were dropped.
 */
public class RemoveDuplicates {
    /**
     * Builds a sample list of persons, separates first occurrences from
     * repeats, and prints the original list, the dropped entries and the
     * cleaned list.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        List<Person> persons = Arrays.asList(
            new Person("Alice", 25, "NY"),
            new Person("Bob", 30, "LA"),
            new Person("Alice", 25, "NY"),
            new Person("Charlie", 35, "Chicago"),
            new Person("Bob", 30, "LA")
        );

        // LinkedHashSet iterates in insertion order, so the cleaned list keeps
        // the order of first occurrences; a plain HashSet gives no such guarantee.
        Set<Person> uniquePersons = new LinkedHashSet<>();
        // Entries dropped as duplicates, in the order they were encountered.
        List<Person> deletedRows = new ArrayList<>();
        for (Person person : persons) {
            // add() returns false when an equal element is already in the set.
            if (!uniquePersons.add(person)) {
                deletedRows.add(person);
            }
        }

        // Cleaned list: one entry per distinct person, first-seen order.
        List<Person> cleanList = new ArrayList<>(uniquePersons);

        System.out.println("原始数据集: " + persons);
        System.out.println("被删除的行: " + deletedRows);
        System.out.println("清理后的数据集: " + cleanList);
    }
}
class Person {
String name;
int age;
String city;
public Person(String name, int age, String city) {
this.name = name;
this.age = age;
this.city = city;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Person person = (Person) o;
return age == person.age &&
Objects.equals(name, person.name) &&
Objects.equals(city, person.city);
}
@Override
public int hashCode() {
return Objects.hash(name, age, city);
}
@Override
public String toString() {
return name + "(" + age + ", " + city + ")";
}
}
通过记录被删除的行，可以保留审计跟踪，便于后续验证和问题排查。
没有搜到相关的文章