赞
踩
original format:
you need to make your data available in one of three formats:
tf.data.Dataset objects: tf.keras.preprocessing.image_dataset_from_directory(...)
# image files sorted into class-specific folders
main_directory/
...class_a/
......a_image_1.jpg
......a_image_2.jpg
...class_b/
......b_image_1.jpg
......b_image_2.jpg
dataset = keras.preprocessing.image_dataset_from_directory(
    'path/to/main_directory', batch_size=64, image_size=(200, 200))
# For demonstration, iterate over the batches yielded by the dataset.
for data, labels in dataset:
    print(data.shape)  # (64, 200, 200, 3) 每批64张、200*200像素、3个RGB通道
    print(data.dtype)  # float32
    print(labels.shape)  # (64,) 每批标签64个
    print(labels.dtype)  # int32
keras.preprocessing.text_dataset_from_directory(...)
dataset = keras.preprocessing.text_dataset_from_directory(
    'path/to/main_directory', batch_size=64)
# For demonstration, iterate over the batches yielded by the dataset.
for data, labels in dataset:
    print(data.shape)  # (64,)
    print(data.dtype)  # string
    print(labels.shape)  # (64,)
    print(labels.dtype)  # int32
tf.data.experimental.make_csv_dataset
to load structured data from CSV files.tf.data.Dataset.from_tensor_slices()
:keras: tf.data.Dataset.from_tensor_slices().as_numpy_iterator()
print(list(dataset.as_numpy_iterator()))
# [(array([1, 3], dtype=int32), array([b'A'], dtype=object)),
# (array([2, 1], dtype=int32), array([b'B'], dtype=object)),
# (array([3, 3], dtype=int32), array([b'A'], dtype=object))]
for element in dataset.as_numpy_iterator():
    print(element)
# (array([1, 3], dtype=int32), array([b'A'], dtype=object))
# (array([2, 1], dtype=int32), array([b'B'], dtype=object))
# (array([3, 3], dtype=int32), array([b'A'], dtype=object))
.take(count):取出几批的样本。
for inputs, targets in dataset.take(1):
    print(inputs)  # tf.Tensor([1 3], shape=(2,), dtype=int32)
    print(targets)  # tf.Tensor([b'A'], shape=(1,), dtype=string)
.batch():指定batch_size。必须指定,不然fit()时会报错。
# 指定一批32个
dataset = dataset.batch(32)
简单来说:把数据缩放到[0.0, 1.0],或使其符合概率学假设(均值0和方差1)。详细:
tensorflow.keras.layers.experimental.preprocessing.TextVectorization
:holds a mapping between string tokens and integer indices.
索引0表示缺省值(即单词长度不够时的空单词""),索引1表示词汇表外的值(词汇表由adapt()指定)。
from tensorflow.keras.layers.experimental import preprocessing
vocabulary = ["aa bb cc"]
data = ["aa bb cc cc dd ee"]
layer = preprocessing.TextVectorization()
layer.adapt(vocabulary) # 以哪个为词汇表
normalized_data = layer(data) # 根据之前adapt()的vocabulary翻译data
print(normalized_data)
# tf.Tensor([[4 3 2 2 1 1]], shape=(1, 6), dtype=int64)
两个cc,可以看到都是2。dd和ee,都是1。
adapt()建立vocabulary映射时,标点符号和空格不算,只看单词。重复的单词只留一个。vocabulary
可以是一维数组["aa bb cc"]
(句子)、["aa bb", "bb cc"]
(句子)、["aa", "bb", "cc"]
(单词),不能是字符串"aa bb cc"
,不能是多列二维数组[["aa", "bb"], ["aa", "cc"]]
,但可以是单列的二维数组[["aa bb"], ["aa cc"]]
(句子)、[["aa"], ["bb"], ["cc"]]
(单词)。
data也有相同的格式要求,结果的形状必定是二维。注意,每行被认为是一个"..."。
data = ["aa bb cc", "cc dd"]
'''
tf.Tensor(
[[2 4 3]
[3 1 0]], shape=(2, 3), dtype=int64)
'''
0。
# Example: one-hot encoded bigrams
from tensorflow.keras.layers.experimental import preprocessing
vocabulary = ["aa bb cc"]
data = ["aa", "bb", "cc", "dd", ""]
layer = preprocessing.TextVectorization(output_mode="binary", ngrams=2)
layer.adapt(vocabulary)
integer_data = layer(data)
print(integer_data)
'''
tf.Tensor(
[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]], shape=(5, 6), dtype=float32)
'''
除了空字符串""那行全为0之外,每行都只有一个位是1,其他都是0。
tensorflow.keras.layers.experimental.preprocessing.Normalization
from tensorflow.keras.layers.experimental import preprocessing
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
normalizer = preprocessing.Normalization()
normalizer.adapt(data)
normalized_data = normalizer(data)
print(normalized_data)
'''
tf.Tensor(
[[-1.2247448 -1.2247448 -1.2247448]
[ 0. 0. 0. ]
[ 1.2247448 1.2247448 1.2247448]], shape=(3, 3), dtype=float32)
'''
# tf.keras.utils.normalize(): numpy array
normalized_data = tf.keras.utils.normalize(data)
print(normalized_data)
tensorflow.keras.layers.experimental.preprocessing.Rescaling
import numpy as np
from tensorflow.keras.layers.experimental import preprocessing
# Example image data, with values in the [0, 255] range
training_data = np.random.randint(0, 256, size=(64, 200, 200, 3)).astype("float32")
# 限定范围:从[0, 255]到[0.0, 1.0]
output_data = preprocessing.Rescaling(scale=1.0 / 255)(training_data)
如果是numpy,那么可以直接
a = np.array([25.5,255])
a = a/255
num_classes(这里是3)必须大于等于labels的最大值+1。y表示的类别应该是[0, MAX],这样恰好符合num_classes。如果从1开始的话,虽然可以,但是创出来就是有一个从没有用到的0列。
y = np.array([0, 2, 1, 2, 1])  # 三类:0 1 2
y = keras.utils.to_categorical(y, 3)
print(y)
'''
[[1. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]]
'''
Categorical data preprocessing layers
Image preprocessing & augmentation layers
normalizer = preprocessing.Normalization()
normalizer.adapt(x_train)
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = rest_of_the_model(x)
model = keras.Model(inputs, outputs)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。