Tensorflow实现将标签变为one-hot形式

砸漏

发布于 2020-11-02 14:36:58

1.3K0

发布于 2020-11-02 14:36:58

文章被收录于专栏：恩蓝脚本

将数据标签变为类似MNIST的one-hot编码形式

def one_hot(indices,
            depth,
            on_value=None,
            off_value=None,
            axis=None,
            dtype=None,
            name=None):
    """Returns a one-hot tensor.

    The locations represented by indices in `indices` take value `on_value`,
    while all other locations take value `off_value`.

    `on_value` and `off_value` must have matching data types. If `dtype` is
    also provided, they must be the same data type as specified by `dtype`.

    If `on_value` is not provided, it will default to the value `1` with type
    `dtype`.

    If `off_value` is not provided, it will default to the value `0` with type
    `dtype`.

    If the input `indices` is rank `N`, the output will have rank `N+1`. The
    new axis is created at dimension `axis` (default: the new axis is appended
    at the end).

    If `indices` is a scalar the output shape will be a vector of length
    `depth`.

    If `indices` is a vector of length `features`, the output shape will be:

    ```
      features x depth if axis == -1
      depth x features if axis == 0
    ```

    If `indices` is a matrix (batch) with shape `[batch, features]`, the
    output shape will be:

    ```
      batch x features x depth if axis == -1
      batch x depth x features if axis == 1
      depth x batch x features if axis == 0
    ```

    If `dtype` is not provided, it will attempt to assume the data type of
    `on_value` or `off_value`, if one or both are passed in. If none of
    `on_value`, `off_value`, or `dtype` are provided, `dtype` will default to
    the value `tf.float32`.

    Note: If a non-numeric data type output is desired (`tf.string`,
    `tf.bool`, etc.), both `on_value` and `off_value` _must_ be provided to
    `one_hot`.

    For example:

    ```python
      indices = [0, 1, 2]
      depth = 3
      tf.one_hot(indices, depth)  # output: [3 x 3]
      # [[1., 0., 0.],
      #  [0., 1., 0.],
      #  [0., 0., 1.]]

      indices = [0, 2, -1, 1]
      depth = 3
      tf.one_hot(indices, depth,
                 on_value=5.0, off_value=0.0,
                 axis=-1)  # output: [4 x 3]
      # [[5.0, 0.0, 0.0],  # one_hot(0)
      #  [0.0, 0.0, 5.0],  # one_hot(2)
      #  [0.0, 0.0, 0.0],  # one_hot(-1)
      #  [0.0, 5.0, 0.0]]  # one_hot(1)

      indices = [[0, 2], [1, -1]]
      depth = 3
      tf.one_hot(indices, depth,
                 on_value=1.0, off_value=0.0,
                 axis=-1)  # output: [2 x 2 x 3]
      # [[[1.0, 0.0, 0.0],   # one_hot(0)
      #   [0.0, 0.0, 1.0]],  # one_hot(2)
      #  [[0.0, 1.0, 0.0],   # one_hot(1)
      #   [0.0, 0.0, 0.0]]]  # one_hot(-1)
    ```

    Args:
      indices: A `Tensor` of indices.
      depth: A scalar defining the depth of the one hot dimension.
      on_value: A scalar defining the value to fill in output when
        `indices[j] = i`. (default: 1)
      off_value: A scalar defining the value to fill in output when
        `indices[j] != i`. (default: 0)
      axis: The axis to fill (default: -1, a new inner-most axis).
      dtype: The data type of the output tensor.
      name: Optional name, used for the name scope and forwarded to the
        underlying op.

    Returns:
      output: The one-hot tensor.

    Raises:
      TypeError: If dtype of either `on_value` or `off_value` don't match
        `dtype`
      TypeError: If dtype of `on_value` and `off_value` don't match one
        another
    """
    with ops.name_scope(name, "one_hot",
                        [indices, depth, on_value, off_value, axis,
                         dtype]) as name:
        on_exists = on_value is not None
        off_exists = off_value is not None

        # Base dtypes of the user-supplied fill values (None when absent).
        on_dtype = (ops.convert_to_tensor(on_value).dtype.base_dtype
                    if on_exists else None)
        off_dtype = (ops.convert_to_tensor(off_value).dtype.base_dtype
                     if off_exists else None)

        if on_exists or off_exists:
            if dtype is not None:
                # Ensure provided on_value and/or off_value match dtype
                if (on_exists and on_dtype != dtype):
                    raise TypeError("dtype {0} of on_value does not match "
                                    "dtype parameter {1}".format(on_dtype, dtype))
                if (off_exists and off_dtype != dtype):
                    raise TypeError("dtype {0} of off_value does not match "
                                    "dtype parameter {1}".format(off_dtype, dtype))
            else:
                # dtype not provided: automatically assign it
                dtype = on_dtype if on_exists else off_dtype
        elif dtype is None:
            # None of on_value, off_value, or dtype provided. Default dtype
            # to float32
            dtype = dtypes.float32

        if not on_exists:
            # on_value not provided: assign to value 1 of type dtype
            on_value = ops.convert_to_tensor(1, dtype, name="on_value")
            on_dtype = dtype
        if not off_exists:
            # off_value not provided: assign to value 0 of type dtype
            off_value = ops.convert_to_tensor(0, dtype, name="off_value")
            off_dtype = dtype

        if on_dtype != off_dtype:
            raise TypeError("dtype {0} of on_value does not match "
                            "dtype {1} of off_value".format(on_dtype, off_dtype))

        return gen_array_ops._one_hot(indices, depth, on_value, off_value,
                                      axis, name)
Enter: apply completion.
+ Ctrl: remove arguments and replace current word (no Pop-
up focus).
+ Shift: remove arguments (requires Pop-up focus).

import tensorflow as tf
import numpy as np
# Ten evenly spaced values from 0 to 9, used as the class labels.
# NOTE(review): np.linspace returns floating-point values; one_hot presumably
# expects integer indices -- confirm the implicit conversion on the
# TensorFlow version in use.
data = np.linspace(0, 9, 10)

# Build the one-hot tensor with depth 10 (one slot per label value).
label = tf.one_hot(data, 10)

# Session-based API: the tensor is only evaluated inside sess.run().
with tf.Session() as sess:
    print(data)
    print(sess.run(label))

补充知识：数据清洗—制作one-hot

使用pandas进行one-hot编码

pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

pandas中get_dummies()函数可以将字段进行编码，转换为01形式，其中prefix可以为每个新展开的列名添加前缀。

但是，笔者发现它更适用于每一列的每个取值都是单个类别值（例如单个字符）的数据：

# Toy frame: columns A and B hold single-character strings, C holds integers.
df = pd.DataFrame(
    {
        'A': ['a', 'b', 'a'],
        'B': ['b', 'a', 'c'],
        'C': [1, 2, 3],
    }
)

# One-hot encode the frame with pandas' built-in helper.
df_dumm = pd.get_dummies(data=df)

my_one_hot

但是，对于下面这种形式的数据（同一个单元格中包含多个以逗号分隔的取值），就不能直接转换了，需要先进行预处理，再转换为one-hot形式：

我的做法是:

# tqdm_notebook comes from the tqdm package and must be imported before use.
def one_hot_my(dataframe, attri):
    """Multi-hot encode a column whose cells hold comma-separated values.

    Every cell of ``dataframe[attri]`` is cast to ``str``, stripped, and split
    on ``','``. Each distinct token gets one output column, in order of first
    appearance, named ``attri + token``. A row has ``1`` in the columns of the
    tokens it contains and ``0`` elsewhere.

    Args:
        dataframe: pandas DataFrame containing the column ``attri``.
            Side effect: ``dataframe[attri]`` is overwritten in place with
            its string-cast version.
        attri: Name of the column to encode; also used as the prefix of the
            generated column names.

    Returns:
        A new DataFrame with one row per input row and one 0/1 column per
        distinct token. It is built from plain lists, so it does not reuse
        the index of ``dataframe``.
    """
    dataframe[attri] = dataframe[attri].astype(str)

    # Pass 1: split every cell and give each new token the next column slot.
    token_to_slot = {}
    tokens_per_row = []
    for cell in tqdm_notebook(dataframe[attri]):
        tokens = cell.strip().split(',')
        for token in tokens:
            # len() is taken before insertion, so slots run 0, 1, 2, ...
            token_to_slot.setdefault(token, len(token_to_slot))
        tokens_per_row.append(tokens)
    print("开始完成one-hot.......")

    # Pass 2: turn each row's tokens into a fixed-width 0/1 vector.
    width = len(token_to_slot)
    encoded_rows = []
    for tokens in tqdm_notebook(tokens_per_row):
        row = [0] * width
        for token in tokens:
            row[token_to_slot[token]] = 1
        encoded_rows.append(row)
    print("封装成dataframe.......")

    # Wrap the vectors in a DataFrame; dict order equals slot order.
    columns = [attri + token for token in token_to_slot]
    return pd.DataFrame(encoded_rows, columns=columns)

对属性二值化可以采用：

# Binarise an attribute against a single value.
def binary_apply(key, attri, dataframe):
    """Add a 0/1 indicator column marking rows where ``attri`` equals ``key``.

    The new column is named ``'is_' + <joined lazy_pinyin(key)> + '_' + attri``
    and the name is printed. ``lazy_pinyin`` is presumably pypinyin's
    romanisation helper -- confirm against the file's imports.

    Args:
        key: Value to compare the column against.
        attri: Name of the column in ``dataframe`` to test.
        dataframe: pandas DataFrame; modified in place by adding the column.

    Returns:
        The same ``dataframe`` object, with the indicator column added.
    """
    key_modify = 'is_' + ''.join(lazy_pinyin(key)) + '_' + attri
    print(key_modify)
    # Vectorised comparison instead of a row-wise apply: avoids one Python
    # call per row and also works when the frame has no rows.
    dataframe[key_modify] = (dataframe[attri] == key).astype("int64")
    return dataframe

对字符进行编码，将字符转换为0、1、2…：

# Encode categorical values as integer codes.
# Example column list:
# columns = ['job', 'marital', 'education','default','housing' ,'loan','contact', 'poutcome']
def encode_info(dataframe, columns):
    """Replace each listed column with the integer codes from ``pd.factorize``.

    Each column name is printed before it is encoded.

    Args:
        dataframe: pandas DataFrame; the listed columns are overwritten in
            place.
        columns: Iterable of column names to encode.

    Returns:
        The same ``dataframe`` object, with the listed columns replaced by
        their integer codes.
    """
    for col in columns:
        print(col)
        # factorize returns (codes, uniques); only the codes are kept.
        dataframe[col] = pd.factorize(dataframe[col])[0]
    return dataframe