TensorFlow 1.13.2

更新时间：2023-01-18

TensorFlow

基于 TensorFlow 框架的 MNIST 图像分类任务示例代码。训练数据集请点击这里下载。

单机训练（计算节点数为1），示例代码如下：

import os
import tensorflow as tf
import numpy as np
from tensorflow import keras
# Short alias for the TF 1.x functional layers module used by conv_model.
layers = tf.layers
# Raise TensorFlow's log level to INFO so INFO-level messages are shown.
tf.logging.set_verbosity(tf.logging.INFO)
def conv_model(feature, target, mode):
    """Build a two-convolution-layer classifier graph and its loss.

    Args:
        feature: Tensor of images; it is reshaped to [-1, 28, 28, 1] before
            the first convolution.
        target: Tensor of class ids; it is cast to int32 and one-hot encoded
            over 10 classes.
        mode: Compared against tf.estimator.ModeKeys.TRAIN to decide whether
            dropout is applied.

    Returns:
        A (prediction, loss) pair: the argmax of the logits along axis 1 and
        the softmax cross-entropy loss tensor.
    """
    # One-hot labels of shape (batch_size, 10) with on-value 1, off-value 0.
    one_hot_target = tf.one_hot(
        tf.cast(target, tf.int32), depth=10, on_value=1, off_value=0)

    # 4-D layout: batch, 28, 28, single colour channel.
    images = tf.reshape(feature, [-1, 28, 28, 1])

    # Convolution stage 1: 32 feature maps from 5x5 patches, then 2x2 pooling.
    with tf.variable_scope('conv_layer1'):
        conv1 = layers.conv2d(
            images, 32, kernel_size=[5, 5],
            activation=tf.nn.relu, padding="SAME")
        pooled1 = tf.nn.max_pool(
            conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Convolution stage 2: 64 feature maps from 5x5 patches, then 2x2 pooling.
    with tf.variable_scope('conv_layer2'):
        conv2 = layers.conv2d(
            pooled1, 64, kernel_size=[5, 5],
            activation=tf.nn.relu, padding="SAME")
        pooled2 = tf.nn.max_pool(
            conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Flatten each example into one vector (kept inside this scope so the
        # reshape op is created under the same scope as before).
        flattened = tf.reshape(pooled2, [-1, 7 * 7 * 64])

    # Fully connected layer with 1024 units, followed by 50% dropout that is
    # only active when the graph is built in TRAIN mode.
    hidden = layers.dense(flattened, 1024, activation=tf.nn.relu)
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    hidden = layers.dropout(hidden, rate=0.5, training=is_training)

    # One logit per class, then the loss against the one-hot labels.
    logits = layers.dense(hidden, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(one_hot_target, logits)
    prediction = tf.argmax(logits, 1)
    return prediction, loss
def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled (features, labels) mini-batches forever.

    At the start of every pass the data is re-shuffled with one shared
    permutation, so features and labels stay aligned. Only full batches are
    produced; a trailing remainder smaller than ``batch_size`` is skipped for
    that pass.

    Args:
        x_train: Array of training features, indexable by an index array.
        y_train: Array of training labels with the same length as x_train.
        batch_size: Number of samples per yielded batch.

    Yields:
        Tuples ``(x_batch, y_batch)``, each of length ``batch_size``.

    Raises:
        ValueError: If the inputs differ in length, or if ``batch_size`` is
            not in the range 1..len(x_train). Without this check an oversized
            batch would make the generator loop forever without yielding.
    """
    num_samples = len(x_train)
    if num_samples != len(y_train):
        raise ValueError(
            'x_train and y_train must have the same length, got %d and %d'
            % (num_samples, len(y_train)))
    if not 0 < batch_size <= num_samples:
        raise ValueError(
            'batch_size must be between 1 and %d, got %r'
            % (num_samples, batch_size))
    while True:
        # Same permutation for both arrays keeps each label with its sample.
        p = np.random.permutation(num_samples)
        x_train, y_train = x_train[p], y_train[p]
        for start in range(0, num_samples - batch_size + 1, batch_size):
            stop = start + batch_size
            yield x_train[start:stop], y_train[start:stop]
def main(_):
    """Train the MNIST CNN on a single node and export it as a SavedModel.

    The forward graph is snapshotted to a meta-graph file before the optimizer
    is added. After training, the default graph is reset and the snapshot is
    re-imported so the export can create new ops; the graph used by
    MonitoredTrainingSession is finalized and cannot be extended.

    Args:
        _: Unused argument supplied by the app runner.
    """
    import shutil  # local import: only needed for the export clean-up

    work_path = os.getcwd()
    # Download and load MNIST dataset. Only the training split is used here.
    (x_train, y_train), _test_data = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)
    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    # NOTE(review): the graph is built in TRAIN mode, so the dropout layer in
    # conv_model stays active in the exported model - confirm this is intended.
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Snapshot the forward graph before optimizer ops are added; it is
    # re-imported after training to build the serving export.
    serve_graph_file = "./serve_graph.meta"
    tf.train.export_meta_graph(serve_graph_file, as_text=True)

    opt = tf.train.RMSPropOptimizer(0.001)
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)
    hooks = [
        tf.train.StopAtStepHook(last_step=20000),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]
    # Single node: grow GPU memory on demand and expose only device 0.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = '0'
    checkpoint_dir = './checkpoints'
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    # The training graph was finalized by the monitored session, so start from
    # a fresh graph holding only the forward pass before adding export ops.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(serve_graph_file)
    # build_tensor_info only records name/dtype/shape, which are the same for
    # the re-imported tensors.
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))
    # Remove any previous export without spawning a shell.
    shutil.rmtree('./output', ignore_errors=True)
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()
# Script entry point: hand control to TensorFlow's app runner, which is
# expected to call main() defined above.
if __name__ == "__main__":
 tf.app.run()

分布式训练（计算节点数大于1），示例代码如下：

说明：本分布式示例程序未对训练数据做分片（每个计算节点都会读取完整的数据集），仅供参考。

import os
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
from tensorflow import keras
# Short alias for the TF 1.x functional layers module used by conv_model.
layers = tf.layers
# Raise TensorFlow's log level to INFO so INFO-level messages are shown.
tf.logging.set_verbosity(tf.logging.INFO)
def conv_model(feature, target, mode):
    """Assemble the 2-conv-layer MNIST network and return prediction and loss.

    Args:
        feature: Image tensor; reshaped to [-1, 28, 28, 1] before use.
        target: Class-id tensor; cast to int32 and expanded to a 10-way
            one-hot encoding.
        mode: Dropout is enabled only when this equals
            tf.estimator.ModeKeys.TRAIN.

    Returns:
        Tuple of (argmax of the logits along axis 1, softmax cross-entropy
        loss tensor).
    """
    num_classes = 10
    dropout_enabled = mode == tf.estimator.ModeKeys.TRAIN

    # Labels -> one-hot vectors of length 10 (on-value 1, off-value 0).
    labels_int = tf.cast(target, tf.int32)
    labels_one_hot = tf.one_hot(labels_int, num_classes, 1, 0)

    # Inputs -> 4-D tensor whose last dimension is the single colour channel.
    net = tf.reshape(feature, [-1, 28, 28, 1])

    # First block: 32 features per 5x5 patch, then 2x2 max pooling.
    with tf.variable_scope('conv_layer1'):
        net = layers.conv2d(net, 32, kernel_size=[5, 5],
                            activation=tf.nn.relu, padding="SAME")
        net = tf.nn.max_pool(net, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

    # Second block: 64 features per 5x5 patch, then 2x2 max pooling.
    with tf.variable_scope('conv_layer2'):
        net = layers.conv2d(net, 64, kernel_size=[5, 5],
                            activation=tf.nn.relu, padding="SAME")
        net = tf.nn.max_pool(net, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')
        # Collapse each example to a flat vector while still in this scope.
        net = tf.reshape(net, [-1, 7 * 7 * 64])

    # Dense layer with 1024 neurons followed by dropout at rate 0.5.
    net = layers.dense(net, 1024, activation=tf.nn.relu)
    net = layers.dropout(net, rate=0.5, training=dropout_enabled)

    # Logits (one per class) and the resulting loss.
    logits = layers.dense(net, num_classes, activation=None)
    loss = tf.losses.softmax_cross_entropy(labels_one_hot, logits)
    return tf.argmax(logits, 1), loss
def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled (features, labels) mini-batches forever.

    At the start of every pass the data is re-shuffled with one shared
    permutation, so features and labels stay aligned. Only full batches are
    produced; a trailing remainder smaller than ``batch_size`` is skipped for
    that pass.

    Args:
        x_train: Array of training features, indexable by an index array.
        y_train: Array of training labels with the same length as x_train.
        batch_size: Number of samples per yielded batch.

    Yields:
        Tuples ``(x_batch, y_batch)``, each of length ``batch_size``.

    Raises:
        ValueError: If the inputs differ in length, or if ``batch_size`` is
            not in the range 1..len(x_train). Without this check an oversized
            batch would make the generator loop forever without yielding.
    """
    num_samples = len(x_train)
    if num_samples != len(y_train):
        raise ValueError(
            'x_train and y_train must have the same length, got %d and %d'
            % (num_samples, len(y_train)))
    if not 0 < batch_size <= num_samples:
        raise ValueError(
            'batch_size must be between 1 and %d, got %r'
            % (num_samples, batch_size))
    while True:
        # Same permutation for both arrays keeps each label with its sample.
        p = np.random.permutation(num_samples)
        x_train, y_train = x_train[p], y_train[p]
        for start in range(0, num_samples - batch_size + 1, batch_size):
            stop = start + batch_size
            yield x_train[start:stop], y_train[start:stop]
def main(_):
    """Train the MNIST CNN with Horovod and export a SavedModel from rank 0.

    Every process builds the same graph and trains with a Horovod-wrapped
    optimizer. Only rank 0 writes checkpoints, the serving meta graph and the
    final SavedModel; all other ranks return once training stops.

    Args:
        _: Unused argument supplied by the app runner.
    """
    import shutil  # local import: only needed for the export clean-up

    # Horovod: initialize Horovod.
    hvd.init()
    work_path = os.getcwd()
    # Download and load MNIST dataset. Only the training split is used here.
    (x_train, y_train), _test_data = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)
    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    # NOTE(review): the graph is built in TRAIN mode, so the dropout layer in
    # conv_model stays active in the exported model - confirm this is intended.
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Snapshot the forward graph before optimizer ops are added. Only rank 0
    # reads this file later, so only rank 0 writes it; this avoids several
    # local ranks writing the same path at once.
    serve_graph_file = "./serve_graph.meta"
    if hvd.rank() == 0:
        tf.train.export_meta_graph(serve_graph_file, as_text=True)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)
    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Only rank 0 owns the checkpoints, so only rank 0 exports the model.
    if hvd.rank() != 0:
        return

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    # Start from a fresh graph holding only the forward pass, then restore the
    # trained weights into it.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(serve_graph_file)
    # build_tensor_info only records name/dtype/shape, which are the same for
    # the re-imported tensors.
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))
    # Remove any previous export without spawning a shell.
    shutil.rmtree('./output', ignore_errors=True)
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()
# Script entry point: hand control to TensorFlow's app runner, which is
# expected to call main() defined above.
if __name__ == "__main__":
    tf.app.run()

发布模型

TensorFlow 2.3.0