
Removing duplicate rows with DISTINCT
Best for simple column-level deduplication; DISTINCT applies directly to the columns listed after SELECT:
SELECT DISTINCT column1, column2 FROM table_name;

Deduplicating with GROUP BY
Grouping also removes duplicates and can be combined with aggregate functions:
SELECT column1, MAX(column2)
FROM table_name
GROUP BY column1;

Marking duplicates with ROW_NUMBER()
ROW_NUMBER() flags duplicate rows and suits more complex deduplication scenarios:
WITH deduplicated AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY column1, column2 ORDER BY id) AS rn
FROM table_name
)
SELECT column1, column2
FROM deduplicated
WHERE rn = 1;

Practical scenario: the same user may have several order records on the same day, and only the latest one should be kept:
WITH ranked_orders AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY user_id, order_date ORDER BY create_time DESC) AS rn
FROM orders
)
SELECT * FROM ranked_orders WHERE rn = 1;

Rebuilding the table through a temporary copy
This approach suits cases where the original table itself must be modified (note that CREATE TABLE ... AS usually does not copy indexes or constraints):
CREATE TABLE temp_table AS
SELECT DISTINCT * FROM original_table;
TRUNCATE TABLE original_table;
INSERT INTO original_table
SELECT * FROM temp_table;
DROP TABLE temp_table;

On some engines DISTINCT over many columns performs poorly, so it is worth comparing the execution plan against an equivalent GROUP BY.

GROUP BY together with HAVING makes it easy to spot duplicate records. The following example finds duplicate values in a specific column:
SELECT column_name, COUNT(*)
FROM table_name
GROUP BY column_name
HAVING COUNT(*) > 1;

To check whether a combination of several columns is duplicated, simply list all the relevant columns in GROUP BY:
SELECT column1, column2, COUNT(*)
FROM table_name
GROUP BY column1, column2
HAVING COUNT(*) > 1;

To see every field of the duplicated records rather than just the counts, use a subquery or a window function:
SELECT *
FROM table_name
WHERE column_name IN (
SELECT column_name
FROM table_name
GROUP BY column_name
HAVING COUNT(*) > 1
);

ROW_NUMBER() handles duplicate data more flexibly and is well suited to marking duplicates; rows numbered greater than 1 are the redundant copies:
SELECT *,
ROW_NUMBER() OVER(PARTITION BY column_name ORDER BY id) AS row_num
FROM table_name;

The standard way to keep a single record per value and delete the other duplicates (on MySQL the subquery cannot reference the table being deleted from, so wrap it in a derived table first):
DELETE FROM table_name
WHERE id NOT IN (
SELECT MIN(id)
FROM table_name
GROUP BY column_name
);

For large data sets, rebuilding through a temporary table (the same pattern shown above) is often more efficient:
CREATE TABLE temp_table AS
SELECT DISTINCT * FROM original_table;
TRUNCATE TABLE original_table;
INSERT INTO original_table
SELECT * FROM temp_table;
DROP TABLE temp_table;

The same ideas carry over to other tools. Below is a quick recap of the SQL options, followed by approaches in Python, pandas, the Linux shell, Hive, and Spark.

Using the DISTINCT keyword
Using DISTINCT directly in a SQL query returns only unique records:
SELECT DISTINCT column1, column2 FROM table_name;

GROUP BY deduplication
Grouping removes duplicates and can be combined with aggregate functions:
SELECT column1, column2
FROM table_name
GROUP BY column1, column2;

Window functions (advanced deduplication)
Mark duplicate rows with a window function such as ROW_NUMBER() and then filter:
WITH deduplicated AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY column1, column2 ORDER BY id) AS rn
FROM table_name
)
SELECT * FROM deduplicated WHERE rn = 1;

Python list deduplication
Use a set for simple cases, or a dict to preserve the original order:
# simple dedup (does not preserve order)
unique_list = list(set(original_list))
# preserves the original order
unique_list = list(dict.fromkeys(original_list))
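
list(set(...)) and dict.fromkeys() only work for hashable items. For a list of dicts, a small order-preserving sketch can key the deduplication on a field of your choice; the user_id field and the sample records below are only illustrative:

def dedup_by_key(items, key):
    # Keep the first item seen for each value of the chosen key field.
    seen = set()
    result = []
    for item in items:
        k = item[key]
        if k not in seen:
            seen.add(k)
            result.append(item)
    return result

records = [{"user_id": 1, "v": "a"}, {"user_id": 1, "v": "b"}, {"user_id": 2, "v": "c"}]
print(dedup_by_key(records, "user_id"))  # keeps the first record per user_id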

Pandas DataFrame deduplication
DataFrames offer several ways to deduplicate:
# basic dedup: keep the first occurrence of each (col1, col2) combination
df.drop_duplicates(subset=['col1','col2'], keep='first')
# drop every row whose col1 value appears more than once
df.drop_duplicates(subset=['col1'], keep=False)  # keep=False removes all copies
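
drop_duplicates can also reproduce the "keep the latest record" scenario from the SQL example above: sort by the timestamp, then keep the last row per group. This is a sketch that reuses the user_id / order_date / create_time column names from that example; adjust them to your data:

import pandas as pd

df = pd.DataFrame({
    "user_id": [1, 1, 2],
    "order_date": ["2024-05-01", "2024-05-01", "2024-05-01"],
    "create_time": ["08:00", "09:30", "10:00"],
})
# Sort so the newest row comes last within each group, then keep that row.
latest = (df.sort_values("create_time")
            .drop_duplicates(subset=["user_id", "order_date"], keep="last"))
print(latest)

If you only want to inspect the duplicates, df.duplicated(subset=[...], keep=False) flags every row that has a duplicate.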

Linux command-line deduplication
Combine sort and uniq (uniq only removes adjacent duplicates, so the input must be sorted first; sort -u does the same in one step):
sort input_file.txt | uniq > output_file.txt
# count how many times each line occurs
sort input_file.txt | uniq -c

Fast deduplication with AWK
The one-liner below prints a line only the first time it is seen, using an associative array, and does not require sorted input:
awk '!seen[$0]++' input_file.txt > output_file.txt

Hive deduplication
Use a window function partitioned on the key column (Hive distributes the rows by the partition key when computing the window):
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY key_column ORDER BY key_column) AS rn  -- order by a timestamp column instead to control which row survives
FROM large_table
) t WHERE rn = 1;

Spark deduplication
Deduplication through the DataFrame API:
df.dropDuplicates(['column1','column2'])
# deduplicate across all columns and write the result straight to Parquet
df.distinct().write.mode('overwrite').parquet('output_path')
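
dropDuplicates keeps an arbitrary (first-encountered) row per key. To control which row survives, for example the newest one, the usual pattern is a window function, mirroring the SQL ROW_NUMBER() approach; the column names below are only illustrative:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-05-01 08:00", "a"), (1, "2024-05-01 09:30", "b"), (2, "2024-05-01 10:00", "c")],
    ["user_id", "create_time", "value"],
)
# Number rows per user_id from newest to oldest, then keep only the newest.
w = Window.partitionBy("user_id").orderBy(F.col("create_time").desc())
latest = (df.withColumn("rn", F.row_number().over(w))
            .filter(F.col("rn") == 1)
            .drop("rn"))
latest.show()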

Similarity matching
For text data, near-duplicates can be found with a similarity score:
from difflib import SequenceMatcher
def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()
# example: pairwise similarity scores for two aligned lists
[similarity(x, y) for x, y in zip(text_list1, text_list2)]
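
One way to turn these scores into deduplication is a greedy pass that drops any string too similar to one already kept. This is only a sketch: the 0.9 threshold is arbitrary and the pairwise comparison is O(n^2), so it suits small lists:

from difflib import SequenceMatcher

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def fuzzy_dedup(texts, threshold=0.9):
    kept = []
    for t in texts:
        # Keep t only if it is not too similar to anything already kept.
        if all(similarity(t, k) < threshold for k in kept):
            kept.append(t)
    return kept

print(fuzzy_dedup(["hello world", "hello  world", "goodbye"]))
# -> ['hello world', 'goodbye']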

Hash comparison
Generate a fingerprint for each row for fast comparison:
import hashlib
def get_hash(value):
    return hashlib.md5(str(value).encode()).hexdigest()
# add a hash column so rows can be compared by fingerprint
df['hash'] = df.apply(lambda row: get_hash(tuple(row)), axis=1)
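
Once every row carries a fingerprint, duplicates can be dropped on the hash column alone, which is convenient for wide tables. A short self-contained sketch (MD5 is fine for fingerprinting, though not for security purposes):

import hashlib
import pandas as pd

def get_hash(value):
    return hashlib.md5(str(value).encode()).hexdigest()

df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
df["hash"] = df.apply(lambda row: get_hash(tuple(row)), axis=1)

# Rows with identical fingerprints are duplicates; keep one of each.
deduped = df.drop_duplicates(subset=["hash"]).drop(columns=["hash"])
print(deduped)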