这就是我认为人们可能会遇到的一个常见问题。假设我有一个想要与某人共享的数据集,但我不想透露产品名称等。在这个数据集的情况下,假设我想随机化列的值: part、site、cust_id、cust_name、region、forecast_group,最后是market。如何有效地做到这一点?
我希望保留原始列值和修改后的列值之间的关联,以便可以将其转换回来。几乎类似于消息收发软件可以使用的消息id和复制ID。
所需的输出是什么?一个与原始数据结构相同的数据框,只是上述各列的值被替换为随机值(最好与原始值长度相同),并且保留原始值与新随机值之间的某种对应关系。
我在下面放了一个dput格式的示例数据。我希望这是提供样本数据集的正确方式。
# Sample data: 20 rows, 11 columns, every column stored as character
# (including `quantity`).
#
# This is dput() output from a data.table. The trailing
# `.internal.selfref = <pointer: ...>` attribute that dput() prints for a
# data.table is not valid R syntax, so it is omitted here to make the
# expression parseable. The result is bound to `df1`.
# NOTE(review): with data.table loaded, data.table::setDT(df1) should restore
# the internal self-reference before using `:=` directly on df1 -- confirm.
df1 <- structure(list(date = c("04-01-17", "04-01-17", "04-01-17", "04-01-17",
"04-01-17", "04-01-17", "04-01-17", "04-01-17", "04-01-17", "04-01-17",
"04-01-17", "04-01-17", "04-01-17", "04-01-17", "04-01-17", "04-01-17",
"04-01-17", "04-01-17", "04-01-17", "04-01-17"), part = c("18423",
"24643", "24644", "27027", "27028", "29309", "324470-0010", "324470-0010",
"324470-0010", "324470-0010", "329509-1300", "329559-0010", "329559-0020",
"329559-0020", "329559-0030", "329583-0010", "34103", "34104",
"34104", "34104"), site = c("ERDR", "ERNF", "EDXC", "EDXC", "EDXC",
"EDXC", "EDXC", "EDXC", "ERDR", "ERDR", "EDXC", "ERDR", "EDXC",
"ERDR", "ERDR", "EDXC", "ERDR", "EDXC", "ERNF", "ERDR"), family = c("HED00028",
"HED00036", "HED00036", "HED00024", "HED00024", "HED00027", "CED00010",
"CED00010", "CED00010", "CED00010", "HED00276", "NRT00006", "NRT00006",
"NRT00006", "NRT00006", "CED00010", "HED00035", "HED00035", "HED00035",
"HED00035"), cust_id = c("E-Commerce_Americas", "902_4000", "E-Commerce_Americas",
"E-Commerce_Americas", "944_4000", "E-Commerce_Canada", "E-Commerce_Americas",
"E-Commerce_Canada", "E-Commerce_Americas", "E-Commerce_Canada",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Canada",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Canada",
"E-Commerce_Americas", "E-Commerce_Americas", "902_4000", "E-Commerce_Americas"
), cust_name = c("E-Commerce Americas", "Mexico", "E-Commerce Americas",
"E-Commerce Americas", "Americas National", "E-Commerce Canada",
"E-Commerce Americas", "E-Commerce Canada", "E-Commerce Americas",
"E-Commerce Canada", "E-Commerce Americas", "E-Commerce Americas",
"E-Commerce Canada", "E-Commerce Americas", "E-Commerce Americas",
"E-Commerce Canada", "E-Commerce Americas", "E-Commerce Americas",
"Mexico", "E-Commerce Americas"), region = c("AMERICAS", "AMERICAS",
"AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS",
"AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS",
"AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS", "AMERICAS"
), forecast_group = c("E-Commerce_Americas", "All_Other_Wholesale_Americas",
"E-Commerce_Americas", "E-Commerce_Americas", "All_Other_Wholesale_Americas",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Americas",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Americas",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Americas",
"E-Commerce_Americas", "E-Commerce_Americas", "E-Commerce_Americas",
"E-Commerce_Americas", "All_Other_Wholesale_Americas", "E-Commerce_Americas"
), market = c("AM", "AM", "AM", "AM", "AM", "AM", "AM", "AM",
"AM", "AM", "AM", "AM", "AM", "AM", "AM", "AM", "AM", "AM", "AM",
"AM"), quantity = c("2", "2", "3", "1", "2", "1", "30", "2",
"18", "2", "1", "1", "4", "6", "3", "1", "1", "2", "1", "1"),
abc_code = c("EOL", "C", "C", "EOL", "EOL", "EOL", "C", "C",
"C", "C", "EOL", "EOL", "EOL", "EOL", "EOL", "EOL", "EOL",
"EOL", "EOL", "EOL")), row.names = c(NA, -20L), class = c("data.table",
"data.frame"))
发布于 2021-08-11 01:14:51
我们可以在 .SDcols 中指定感兴趣的列,循环遍历这些列并应用 anonymize 函数,再用 substr 取返回值的前 6 个字符,然后将结果赋给新列;新列的名称是在原始列名前加上前缀 'anon_' 得到的。
library(data.table)
# devtools::install_github("paulhendricks/anonymizer")
library(anonymizer)

# Work on a copy: the `:=` assignment below adds columns to `df2` in place,
# and `df1` should keep only its original columns.
df2 <- copy(df1)

# Source columns to mask, and the matching "anon_"-prefixed output columns.
cols_to_mask <- c(
  "part", "site", "family", "cust_id", "cust_name", "region",
  "forecast_group", "market", "quantity"
)
masked_cols <- paste0("anon_", cols_to_mask)

# Anonymize a vector and keep only the first 6 characters of each result.
short_anonymize <- function(values) {
  anonymized <- anonymize(values)
  substr(anonymized, 1, 6)
}

# Original columns stay in place next to their masked counterparts, so each
# row carries the original -> masked association.
df2[
  ,
  (masked_cols) := lapply(.SD, short_anonymize),
  .SDcols = cols_to_mask
]
-output
> head(df2)
date part site family cust_id cust_name region forecast_group market quantity abc_code anon_part
1: 04-01-17 18423 ERDR HED00028 E-Commerce_Americas E-Commerce Americas AMERICAS E-Commerce_Americas AM 2 EOL 6a57ed
2: 04-01-17 24643 ERNF HED00036 902_4000 Mexico AMERICAS All_Other_Wholesale_Americas AM 2 C e81170
3: 04-01-17 24644 EDXC HED00036 E-Commerce_Americas E-Commerce Americas AMERICAS E-Commerce_Americas AM 3 C a34407
4: 04-01-17 27027 EDXC HED00024 E-Commerce_Americas E-Commerce Americas AMERICAS E-Commerce_Americas AM 1 EOL 0b2bd4
5: 04-01-17 27028 EDXC HED00024 944_4000 Americas National AMERICAS All_Other_Wholesale_Americas AM 2 EOL d91428
6: 04-01-17 29309 EDXC HED00027 E-Commerce_Canada E-Commerce Canada AMERICAS E-Commerce_Americas AM 1 EOL b9067e
anon_site anon_family anon_cust_id anon_cust_name anon_region anon_forecast_group anon_market anon_quantity
1: 406c1d ec7a16 603d29 eb0da1 e6943d 603d29 df0785 de855e
2: 4ca838 740420 8cb50d fe4c6a e6943d 16947f df0785 de855e
3: 396de1 740420 603d29 eb0da1 e6943d 603d29 df0785 c170da
4: 396de1 7ff174 603d29 eb0da1 e6943d 603d29 df0785 51f2e9
5: 396de1 7ff174 3e8856 0aefe7 e6943d 16947f df0785 de855e
6: 396de1 bfbe64 50fcbf ef1088 e6943d 603d29 df0785 51f2e9
https://stackoverflow.com/questions/68734824
复制相似问题