Loading images with tf.data
- import tensorflow as tf
- AUTOTUNE = tf.data.experimental.AUTOTUNE
- # Download the dataset
- import pathlib
- data_root_orig = tf.keras.utils.get_file(origin='https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
- fname='flower_photos', untar=True)
- data_root = pathlib.Path(data_root_orig)
- print(data_root)
-
- '''
- Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
- 228813984/228813984 [==============================] - 3s 0us/step
- /home/kbuilder/.keras/datasets/flower_photos
- '''
-
- # Inspect the dataset directory
- for item in data_root.iterdir():
- print(item)
-
- '''
- /home/kbuilder/.keras/datasets/flower_photos/daisy
- /home/kbuilder/.keras/datasets/flower_photos/dandelion
- /home/kbuilder/.keras/datasets/flower_photos/sunflowers
- /home/kbuilder/.keras/datasets/flower_photos/roses
- /home/kbuilder/.keras/datasets/flower_photos/LICENSE.txt
- /home/kbuilder/.keras/datasets/flower_photos/tulips
- '''
- # 1. Shuffle the image paths
- import random
- all_image_paths = list(data_root.glob('*/*'))
- all_image_paths = [str(path) for path in all_image_paths]
- random.shuffle(all_image_paths)
-
- image_count = len(all_image_paths)
- image_count
- # 3670
-
- # Inspect the images
- import os
- attributions = (data_root/"LICENSE.txt").open(encoding='utf-8').readlines()[4:]
- attributions = [line.split(' CC-BY') for line in attributions]
- attributions = dict(attributions)
-
- import IPython.display as display
- def caption_image(image_path):
- image_rel = pathlib.Path(image_path).relative_to(data_root)
- return "Image (CC BY 2.0) " + ' - '.join(attributions[str(image_rel)].split(' - ')[:-1])
-
- for n in range(3):
- image_path = random.choice(all_image_paths)
- display.display(display.Image(image_path))
- print(caption_image(image_path))
- print()
-
- # 2. Determine the label for each image
- # 2.1 List the available labels
- label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
- label_names
- # ['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']
-
- # 2.2 Assign an index to each label
- label_to_index = dict((name, index) for index, name in enumerate(label_names))
- label_to_index
- # {'daisy': 0, 'dandelion': 1, 'roses': 2, 'sunflowers': 3, 'tulips': 4}
-
- # 2.3 Create a list with the label index of every file
- all_image_labels = [label_to_index[pathlib.Path(path).parent.name]
- for path in all_image_paths]
-
- print("First 10 labels indices: ", all_image_labels[:10])
- # First 10 labels indices: [4, 2, 4, 1, 1, 2, 4, 4, 3, 2]
-
- # 3. Load and format the images
- # 3.1 Take the first image
- img_path = all_image_paths[0]
- img_path
- # '/home/kbuilder/.keras/datasets/flower_photos/tulips/14099204939_60e6ffa4c3_n.jpg'
-
- # Look at the raw data
- img_raw = tf.io.read_file(img_path)
- print(repr(img_raw)[:100]+"...")
- # <tf.Tensor: shape=(), dtype=string, numpy=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00...
-
- # 3.2 Decode the raw data into an image tensor
- img_tensor = tf.image.decode_image(img_raw)
-
- print(img_tensor.shape)
- print(img_tensor.dtype)
-
- '''
- (212, 320, 3)
- <dtype: 'uint8'>
- '''
-
- # 3.3 Resize the image tensor to match the model's input
- img_final = tf.image.resize(img_tensor, [192, 192])
- img_final = img_final/255.0
- print(img_final.shape)
- print(img_final.numpy().min())
- print(img_final.numpy().max())
-
- '''
- (192, 192, 3)
- 0.0
- 1.0
- '''
-
- # 3.4 Wrap the steps above in functions
- def preprocess_image(image):
- image = tf.image.decode_jpeg(image, channels=3)
- image = tf.image.resize(image, [192, 192])
- image /= 255.0 # normalize to [0,1] range
-
- return image
-
- def load_and_preprocess_image(path):
- image = tf.io.read_file(path)
- return preprocess_image(image)
-
- # Apply the wrapper functions
- import matplotlib.pyplot as plt
-
- image_path = all_image_paths[0]
- label = all_image_labels[0]
-
- plt.imshow(load_and_preprocess_image(image_path))
- plt.grid(False)
- plt.xlabel(caption_image(image_path))
- plt.title(label_names[label].title())
- print()
Build a tf.data.Dataset:
The from_tensor_slices method makes it easy to build a tf.data.Dataset.
- # 1. Build a dataset of path strings, path_ds
- # Slicing the array of strings yields a dataset of strings, path_ds
- path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
-
- print(path_ds)
- # <TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>
-
- # 2. Load and format the images to build the image dataset image_ds
- # Create a new dataset, image_ds, that loads and formats images on the fly by mapping load_and_preprocess_image over the dataset of paths
- image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
-
- import matplotlib.pyplot as plt
-
- plt.figure(figsize=(8,8))
- for n, image in enumerate(image_ds.take(4)):
- plt.subplot(2,2,n+1)
- plt.imshow(image)
- plt.grid(False)
- plt.xticks([])
- plt.yticks([])
- plt.xlabel(caption_image(all_image_paths[n]))
- plt.show()
-
- # 3. Create a dataset of labels, label_ds
- label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(all_image_labels, tf.int64))
-
- for label in label_ds.take(10):
- print(label_names[label.numpy()])
-
- # 4. Build a dataset of (image, label) pairs, image_label_ds
- image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))
-
- print(image_label_ds)
- # <ZipDataset element_spec=(TensorSpec(shape=(192, 192, 3), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>
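An equivalent way to arrive at the same (image, label) dataset, shown here as a minimal sketch (the helper name load_and_preprocess_from_path_label is just illustrative), is to slice the paired arrays of paths and labels and map a loader over the pairs:
- # Slice the (path, label) pairs into a single dataset
- path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))
-
- # Load the image for each path and pass the label through unchanged
- def load_and_preprocess_from_path_label(path, label):
-     return load_and_preprocess_image(path), label
-
- image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=AUTOTUNE)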
Prepare the data for training:
- # 1. Prepare the training pipeline
- BATCH_SIZE = 32
-
- # Use a shuffle buffer as large as the dataset to ensure the data is fully shuffled
- # While the model trains, `prefetch` lets the dataset fetch batches in the background, using extra memory to reduce latency
- ds = image_label_ds.shuffle(buffer_size=image_count) # the output order of ds differs on every pass
- ds = ds.repeat() # each repetition produces a different ordering
- ds = ds.batch(BATCH_SIZE) # group ds into batches
- ds = ds.prefetch(buffer_size=AUTOTUNE) # prepare upcoming batches in the background
- ds
- # <PrefetchDataset element_spec=(TensorSpec(shape=(None, 192, 192, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
1. Order matters: a .shuffle placed after .repeat mixes elements across epoch boundaries, and a .shuffle placed after .batch shuffles the order of whole batches rather than the elements within them.
2. Using a buffer size as large as the dataset gives a more thorough shuffle, but uses more memory.
3. The shuffle buffer has to be filled before any element can be pulled from it, so a large buffer_size may cause a delay when the Dataset starts up.
4. The shuffled dataset does not report the end of the dataset until the shuffle buffer is completely empty. The Dataset is then restarted by .repeat, which causes another wait while the shuffle buffer refills (the caching sketch below helps with this).
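One way to cut this repeated per-epoch cost is to cache the preprocessed images so they are not re-read and re-decoded on every pass. This is a rough sketch, assuming the decoded images fit in memory (tf.data.Dataset.cache also accepts a filename for an on-disk cache):
- # Cache the decoded, resized images after the first pass over the data
- ds = image_label_ds.cache()
- ds = ds.shuffle(buffer_size=image_count).repeat()
- ds = ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)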
Build and train the model:
- # 2. Build the model
- # 2.1 Use a copy of MobileNetV2 as the base for transfer learning
- mobile_net = tf.keras.applications.MobileNetV2(input_shape=(192, 192, 3), include_top=False)
-
- # Freeze the MobileNetV2 weights so they are not trainable
- mobile_net.trainable=False
-
- # MobileNetV2 expects its inputs in the [-1, 1] range, so rescale the images from [0, 1] to [-1, 1]
- def change_range(image,label):
- return 2*image-1, label
-
- keras_ds = ds.map(change_range)
-
- # MobileNetV2 returns a 6x6 spatial grid of features for each image
- # The dataset may take a few seconds to start while its shuffle buffer fills; pass one batch of images through the base model
- image_batch, label_batch = next(iter(keras_ds))
-
- feature_map_batch = mobile_net(image_batch)
- print(feature_map_batch.shape)
- # (32, 6, 6, 1280)
- # Each batch holds 32 examples, and each example yields a 6x6 spatial grid of features
-
- # 2.2 Assemble the full model
- model = tf.keras.Sequential([
- mobile_net,
- tf.keras.layers.GlobalAveragePooling2D(), # average over the spatial dimensions
- tf.keras.layers.Dense(len(label_names), activation = 'softmax')])
-
- logit_batch = model(image_batch).numpy()
-
- print("min logit:", logit_batch.min())
- print("max logit:", logit_batch.max())
- print()
-
- print("Shape:", logit_batch.shape)
-
- '''
- min logit: 0.014231807
- max logit: 0.7678226
- Shape: (32, 5)
- '''
-
- model.compile(optimizer=tf.keras.optimizers.Adam(),
- loss='sparse_categorical_crossentropy',
- metrics=["accuracy"])
-
- # 2.3 Inspect the model structure
- # The only trainable variables are the weights and bias of the Dense layer
- model.summary()
-
- '''
- Model: "sequential"
- _________________________________________________________________
- Layer (type) Output Shape Param #
- =================================================================
- mobilenetv2_1.00_192 (Funct (None, 6, 6, 1280) 2257984
- ional)
-
- global_average_pooling2d (G (None, 1280) 0
- lobalAveragePooling2D)
-
- dense (Dense) (None, 5) 6405
-
- =================================================================
- Total params: 2,264,389
- Trainable params: 6,405
- Non-trainable params: 2,257,984
- _________________________________________________________________
- '''
-
- # 2.4 Train the model
- # For demonstration purposes each epoch runs only 3 steps; normally the number of steps per epoch is computed and passed to model.fit()
- steps_per_epoch=tf.math.ceil(len(all_image_paths)/BATCH_SIZE).numpy()
- steps_per_epoch
- # 115.0
-
- model.fit(ds, epochs=1, steps_per_epoch=3)
This section uses Titanic passenger data; the model predicts a passenger's likelihood of survival from features such as age, sex, ticket class, and whether the passenger was travelling alone.
- import functools
-
- import numpy as np
- import tensorflow as tf
- import tensorflow_datasets as tfds
-
- TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
- TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
-
- train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
- test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)
-
- # Make numpy values easier to read
- np.set_printoptions(precision=3, suppress=True)
Look at the CSV file to get a sense of its format:
- !head {train_file_path}
-
- '''
- survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
- 0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
- 1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
- 1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
- 1,female,35.0,1,0,53.1,First,C,Southampton,n
- 0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
- 0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
- 1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
- 1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
- 1,female,4.0,1,1,16.7,Third,G,Southampton,n
- '''
Each column of the CSV file has a column name, and the dataset constructor picks these names up automatically. If the first line of the file does not contain column names, pass them as a list of strings to the column_names argument of make_csv_dataset.
- CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
-
- dataset = tf.data.experimental.make_csv_dataset(
- ...,
- column_names=CSV_COLUMNS,
- ...)
This example uses all of the columns. If you need to omit some columns from the dataset, create a list containing only the columns you plan to use and pass it to the (optional) select_columns argument of the constructor.
- dataset = tf.data.experimental.make_csv_dataset(
- ...,
- select_columns = columns_to_use,
- ...)
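For example, a concrete sketch of such a call on the train.csv downloaded above (the column subset is purely illustrative, and column_names is omitted because the file already has a header row):
- columns_to_use = ['survived', 'sex', 'age', 'fare', 'class']
- temp_dataset = tf.data.experimental.make_csv_dataset(
-     train_file_path,
-     batch_size=5,
-     select_columns=columns_to_use,
-     label_name='survived',
-     num_epochs=1,
-     ignore_errors=True)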
Specify the column that contains the label:
- LABEL_COLUMN = 'survived'
- LABELS = [0, 1]
Read the CSV data and create the dataset:
- def get_dataset(file_path):
- dataset = tf.data.experimental.make_csv_dataset(
- file_path,
- batch_size=12, # set a small value so the examples are easy to display
- label_name=LABEL_COLUMN,
- na_value="?",
- num_epochs=1,
- ignore_errors=True)
- return dataset
-
- raw_train_data = get_dataset(train_file_path)
- raw_test_data = get_dataset(test_file_path)
Each item in the dataset is a batch, represented as a tuple of (many examples, many labels). The data in the examples is organized as column-based tensors (rather than row-based tensors), each containing as many elements as the batch size (12 in this example).
- examples, labels = next(iter(raw_train_data)) # the first batch
- print("EXAMPLES: \n", examples, "\n")
- print("LABELS: \n", labels)
-
- '''
- EXAMPLES:
- OrderedDict([('sex', <tf.Tensor: shape=(12,), dtype=string, numpy=
- array([b'female', b'female', b'male', b'male', b'female', b'male',
- b'female', b'male', b'female', b'male', b'female', b'male'],
- dtype=object)>), ('age', <tf.Tensor: shape=(12,), dtype=float32, numpy=
- array([28., 41., 28., 24., 63., 28., 28., 65., 34., 9., 27., 30.],
- dtype=float32)>), ...])
- LABELS:
- tf.Tensor([1 0 0 0 1 0 0 0 1 0 1 0], shape=(12,), dtype=int32)
- '''
Categorical data: for features that are categorical (i.e. columns that can only take values from a limited set), use the tf.feature_column API to create a collection of tf.feature_column.indicator_column objects, one per categorical column.
- CATEGORIES = {
- 'sex': ['male', 'female'],
- 'class' : ['First', 'Second', 'Third'],
- 'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
- 'embark_town' : ['Cherbourg', 'Southampton', 'Queenstown'],
- 'alone' : ['y', 'n']
- }
-
- categorical_columns = []
- for feature, vocab in CATEGORIES.items():
- cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
- key=feature, vocabulary_list=vocab)
- categorical_columns.append(tf.feature_column.indicator_column(cat_col))
-
- # Inspect what was just created
- categorical_columns
-
- '''
- [IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
- IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
- IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
- IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
- IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
- '''
Continuous data: the continuous columns need to be normalized.
- def process_continuous_data(mean, data):
- # normalize the data
- data = tf.cast(data, tf.float32) * 1/(2*mean)
- return tf.reshape(data, [-1, 1])
-
- MEANS = {
- 'age' : 29.631308,
- 'n_siblings_spouses' : 0.545455,
- 'parch' : 0.379585,
- 'fare' : 34.385399
- }
-
- numerical_columns = []
-
- for feature in MEANS.keys():
- num_col = tf.feature_column.numeric_column(feature, normalizer_fn=functools.partial(process_continuous_data, MEANS[feature]))
- numerical_columns.append(num_col)
-
- # Inspect what was created
- numerical_columns
-
- '''
- [NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function process_continuous_data at 0x7f74fc4f8c10>, 29.631308)),
- NumericColumn(key='n_siblings_spouses', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function process_continuous_data at 0x7f74fc4f8c10>, 0.545455)),
- NumericColumn(key='parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function process_continuous_data at 0x7f74fc4f8c10>, 0.379585)),
- NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function process_continuous_data at 0x7f74fc4f8c10>, 34.385399))]
- '''
Combine the collections of categorical and continuous feature columns and pass them to tf.keras.layers.DenseFeatures to create an input layer that performs the preprocessing.
- preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numerical_columns)
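As a quick sanity check, a small sketch that applies the preprocessing layer to the examples batch pulled from raw_train_data earlier; each row comes out as one dense float vector of one-hot categories plus scaled numeric values:
- print(preprocessing_layer(examples).numpy()[0])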
- model = tf.keras.Sequential([
- preprocessing_layer, # preprocessing input layer
- tf.keras.layers.Dense(128, activation='relu'), # fully connected layer
- tf.keras.layers.Dense(128, activation='relu'), # fully connected layer
- tf.keras.layers.Dense(1, activation='sigmoid'), # output layer
- ])
-
- model.compile(
- loss='binary_crossentropy',
- optimizer='adam',
- metrics=['accuracy'])
- # 1. Train the model
- train_data = raw_train_data.shuffle(500)
- test_data = raw_test_data
-
- model.fit(train_data, epochs=20)
-
- # 2. Evaluate the model
- test_loss, test_accuracy = model.evaluate(test_data)
-
- print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))
- # Test Loss 0.4481576979160309, Test Accuracy 0.8030303120613098
-
- # 3. Make predictions
- predictions = model.predict(test_data)
-
- # Show the results for the first ten examples of the first batch
- for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
- print("Predicted survival: {:.2%}".format(prediction[0]),
- " | Actual outcome: ",
- ("SURVIVED" if bool(survived) else "DIED"))
-
- '''
- 22/22 [==============================] - 0s 4ms/step
- Predicted survival: 90.08% | Actual outcome: SURVIVED
- Predicted survival: 0.97% | Actual outcome: SURVIVED
- Predicted survival: 0.98% | Actual outcome: DIED
- Predicted survival: 10.06% | Actual outcome: SURVIVED
- Predicted survival: 62.49% | Actual outcome: DIED
- Predicted survival: 62.38% | Actual outcome: SURVIVED
- Predicted survival: 11.18% | Actual outcome: SURVIVED
- Predicted survival: 60.31% | Actual outcome: SURVIVED
- Predicted survival: 9.72% | Actual outcome: DIED
- Predicted survival: 21.93% | Actual outcome: DIED
- '''
- import numpy as np
- import tensorflow as tf
- # Load the data from an .npz file
- DATA_URL = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
-
- path = tf.keras.utils.get_file('mnist.npz', DATA_URL)
- with np.load(path) as data:
- train_examples = data['x_train']
- train_labels = data['y_train']
- test_examples = data['x_test']
- test_labels = data['y_test']
-
- # Create the datasets
- train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))
- test_dataset = tf.data.Dataset.from_tensor_slices((test_examples, test_labels))
- # shuffle and batch
- BATCH_SIZE = 64
- SHUFFLE_BUFFER_SIZE = 100
-
- train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
- test_dataset = test_dataset.batch(BATCH_SIZE)
-
-
- # Build and train the model
- model = tf.keras.Sequential([
- tf.keras.layers.Flatten(input_shape=(28, 28)),
- tf.keras.layers.Dense(128, activation='relu'),
- tf.keras.layers.Dense(10)
- ])
-
- model.compile(optimizer=tf.keras.optimizers.RMSprop(),
- loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- metrics=['sparse_categorical_accuracy'])
-
- model.fit(train_dataset, epochs=10)
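The test_dataset built above can then be used to check how the model generalizes; a minimal sketch:
- test_loss, test_acc = model.evaluate(test_dataset)
- print('Test accuracy:', test_acc)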
This example uses a small dataset provided by the Cleveland Clinic Foundation for Heart Disease. The CSV has a few hundred rows; each row describes a patient and each column describes an attribute. We will use this information to predict whether a patient has heart disease, which is a binary classification problem.
- !pip install tensorflow-gpu==2.0.0-rc1
- import pandas as pd
- import tensorflow as tf
- # Download the CSV file of the dataset
- csv_file = tf.keras.utils.get_file('heart.csv', 'https://storage.googleapis.com/applied-dl/heart.csv')
-
- # Read the CSV file with pandas
- df = pd.read_csv(csv_file)
-
- # Use df.head() to look at the data and df.dtypes to see the data type of each column
- # Convert the categorical feature to discrete numeric codes
- df['thal'] = pd.Categorical(df['thal'])
- df['thal'] = df.thal.cat.codes
-
- # Read the data with tf.data.Dataset.from_tensor_slices
- target = df.pop('target') # the label column
- dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))
-
- # shuffle and batch
- train_dataset = dataset.shuffle(len(df)).batch(1)
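To see what from_tensor_slices produced, a small sketch that prints the first two (features, target) pairs from the unbatched dataset:
- for feat, targ in dataset.take(2):
-     print('Features: {}, Target: {}'.format(feat, targ))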
- def get_compiled_model():
- model = tf.keras.Sequential([
- tf.keras.layers.Dense(10, activation='relu'),
- tf.keras.layers.Dense(10, activation='relu'),
- tf.keras.layers.Dense(1, activation='sigmoid')
- ])
-
- model.compile(optimizer='adam',
- loss='binary_crossentropy',
- metrics=['accuracy'])
- return model
-
- model = get_compiled_model()
- model.fit(train_dataset, epochs=15)
Skipped for now.
Use tf.data.TextLineDataset to load text files; it is typically used to build a dataset from text files in which each line of the original file is one example. This example uses three different English translations of the same work, Homer's Iliad, and trains a model to identify the translator from a single line of text.
- import tensorflow as tf
-
- import tensorflow_datasets as tfds
- import os
- DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
- FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']
-
- for name in FILE_NAMES:
- text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
-
- parent_dir = os.path.dirname(text_dir)
-
- parent_dir
- # '/home/kbuilder/.keras/datasets'
Each example needs to be labeled individually, so use tf.data.Dataset.map to apply a labeler to every example. This iterates over each example in the dataset and returns (example, label) pairs.
- def labeler(example, index):
- return example, tf.cast(index, tf.int64)
-
- labeled_data_sets = []
-
- for i, file_name in enumerate(FILE_NAMES):
- lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
- labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
- labeled_data_sets.append(labeled_dataset)
Build the combined dataset:
- BUFFER_SIZE = 50000
- BATCH_SIZE = 64
- TAKE_SIZE = 5000
-
- all_labeled_data = labeled_data_sets[0]
- for labeled_dataset in labeled_data_sets[1:]:
- all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
-
- all_labeled_data = all_labeled_data.shuffle(
- BUFFER_SIZE, reshuffle_each_iteration=False)
-
- # Use tf.data.Dataset.take and print to see what the (example, label) pairs look like
- # The numpy property shows each Tensor's value
- for ex in all_labeled_data.take(5):
- print(ex)
-
- '''
- (<tf.Tensor: shape=(), dtype=string, numpy=b'To Ida; in his presence once arrived,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
- (<tf.Tensor: shape=(), dtype=string, numpy=b"Such now appears th' o'er-ruling sov'reign will">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
- (<tf.Tensor: shape=(), dtype=string, numpy=b'Them so prepared the King of men beheld'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
- (<tf.Tensor: shape=(), dtype=string, numpy=b'mourn you, but the eddies of Scamander shall bear you into the broad'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
- (<tf.Tensor: shape=(), dtype=string, numpy=b'there was no life left in him.'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
- '''
Encode the text lines as numbers:
- # 1. Build a vocabulary
- # 1.1 Iterate over each example's numpy value
- # 1.2 Use tfds.features.text.Tokenizer to split it into tokens
- tokenizer = tfds.features.text.Tokenizer()
-
- # 1.3 Collect the tokens into a Python set to remove duplicates
- vocabulary_set = set()
- for text_tensor, _ in all_labeled_data:
- some_tokens = tokenizer.tokenize(text_tensor.numpy())
- vocabulary_set.update(some_tokens)
-
- # 1.4 Get the size of the vocabulary
- vocab_size = len(vocabulary_set)
- vocab_size
- # 17178
-
- # 2. Encode the examples
- # Passing a line of text to the encoder's encode method returns a list of integers
- # 2.1 Build the encoder
- encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
-
- # 2.2 Run the encoder on the dataset
- # Wrap the encoder in tf.py_function and pass it to the dataset's map method to run it over the dataset
- # tf.py_function wraps a Python function so it can be used as an op inside the TensorFlow graph
- # tf.py_function(func, inp, Tout, name=None): func is a Python function that takes inp as arguments and returns outputs of type Tout
- def encode(text_tensor, label):
- encoded_text = encoder.encode(text_tensor.numpy())
- return encoded_text, label
-
- def encode_map_fn(text, label):
- encoded_text, label = tf.py_function(encode,
- inp=[text, label],
- Tout=(tf.int64, tf.int64))
-
- encoded_text.set_shape([None])
- label.set_shape([])
-
- return encoded_text, label
-
-
- all_encoded_data = all_labeled_data.map(encode_map_fn)
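As a quick sanity check, a small sketch that encodes a single example eagerly with the encoder built above and prints the resulting list of integers:
- example_text, example_label = next(iter(all_labeled_data))
- print(example_text.numpy())
- print(encoder.encode(example_text.numpy()))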
Split the dataset into test and training sets and batch them:
Use tf.data.Dataset.take and tf.data.Dataset.skip to create a small test dataset and a larger training set. Before being passed to the model, the dataset needs to be batched. The examples inside each batch must have the same size and shape, but the examples in the dataset are not all the same size (each line of text has a different number of words). So use tf.data.Dataset.padded_batch (instead of batch) to pad the examples to the same size.
- train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
- train_data = train_data.padded_batch(BATCH_SIZE)
-
- test_data = all_encoded_data.take(TAKE_SIZE)
- test_data = test_data.padded_batch(BATCH_SIZE)
-
- # Look at the test set
- sample_text, sample_labels = next(iter(test_data))
-
- sample_text[0], sample_labels[0]
-
- '''
- (<tf.Tensor: shape=(16,), dtype=int64, numpy=
- array([15746, 11433, 8394, 9006, 379, 3463, 17072, 0, 0,
- 0, 0, 0, 0, 0, 0, 0])>,
- <tf.Tensor: shape=(), dtype=int64, numpy=0>)
- '''
-
- # Now test_data and train_data are collections of batches rather than collections of (example, label) pairs
- # Each batch is a pair of (many examples, many labels) represented as arrays
-
- # Padding introduces a new token (zero), so the vocabulary size increases by one
- vocab_size += 1
- model = tf.keras.Sequential()
-
- # The Embedding layer converts the integer representations into dense vector embeddings
- model.add(tf.keras.layers.Embedding(vocab_size, 64))
-
- # The LSTM layer lets the model understand words in the context of the words around them
- model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
-
- # One or more densely connected layers
- # Edit the list in the `for` line to experiment with the layer sizes
- for units in [64, 64]:
- model.add(tf.keras.layers.Dense(units, activation='relu'))
-
- # Output layer; the first argument is the number of labels
- model.add(tf.keras.layers.Dense(3, activation='softmax'))
-
- # Compile the model
- model.compile(optimizer='adam',
- loss='sparse_categorical_crossentropy',
- metrics=['accuracy'])
- model.fit(train_data, epochs=3, validation_data=test_data)
-
- eval_loss, eval_acc = model.evaluate(test_data)
-
- print('\nEval loss: {}, Eval accuracy: {}'.format(eval_loss, eval_acc))
-
- '''
- 79/79 [==============================] - 1s 18ms/step - loss: 0.3794 - accuracy: 0.8246
- Eval loss: 0.3794495761394501, Eval accuracy: 0.8245999813079834
- '''