# Overview

## LSTM

LSTM was covered in an earlier article of mine, Tensorflow[基础篇]——lstm的理解与实现. LSTM is a deep learning network for processing sequences; what sets it apart from a plain RNN is that it is well suited to learning and predicting important events separated by very long gaps and delays in a time series. In other words, it can memorize context that sits far away from the word being predicted and use it as important evidence for the prediction. If you want the formulas, papers, and implementation details behind LSTM, see that article; I won't repeat them here.

## IDEA

“[”->“大”，“大”->“漠”，“漠”->“孤”，“孤”->“烟”，“烟”->“直”，“直”->“]”，“]”->“]”，这样子先后顺序一一对相。这也是RNN的一个重要的特征。

For the LSTM model we use TensorFlow's `tf.nn.rnn_cell.BasicLSTMCell` to build the basic cell; you can of course swap in other variants (GRU and so on). Finally, `sequence_loss_by_example` computes the loss that serves as the training objective.
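
Since the training script below flattens all character positions and passes weights of all ones, the resulting cost is, as I read it, simply the mean cross-entropy over every position in the batch:

$$
\mathrm{cost} = -\frac{1}{BT} \sum_{b=1}^{B} \sum_{t=1}^{T} \log p\left(y_{b,t} \mid x_{b,1}, \dots, x_{b,t}\right)
$$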

# Detail

## 1. PoetryDataset.py

```python
# -*-coding:utf-8-*-#
import collections

def get_poetrys(poetry_file="./poetry.txt"):
    poetrys = []
    with open(poetry_file, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            try:
                title, content = line.strip().split(":")
                content = content.replace(" ", "")
                # skip poems that contain annotation characters
                if '_' in content or '(' in content or \
                        '（' in content or '《' in content or \
                        '[' in content:
                    continue
                if len(content) < 5 or len(content) > 79:
                    continue
                # '[' marks the start of a poem and ']' marks the end
                content = '[' + content + ']'
                poetrys.append(content)
            except Exception as e:
                pass
    return poetrys

def build_dataset():
    poetrys = get_poetrys()
    poetrys = sorted(poetrys, key=lambda line: len(line))
    print("Total number of Tang poems:", len(poetrys))
    words = []
    for poetry in poetrys:
        words += [word for word in poetry]
    counter = collections.Counter(words)
    # sort by frequency, descending
    counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
    # unzip the counter to get the unique characters
    words, _ = zip(*counter_pairs)
    # append a space as the padding character
    words = words + (" ",)
    # word -> id
    dictionary = dict(zip(words, range(len(words))))
    # id -> word
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    poetry_vectors = [[dictionary[word] for word in poetry] for poetry in poetrys]
    return dictionary, poetry_vectors, reversed_dictionary
```
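
A quick sanity check, run from the same module (assuming a `poetry.txt` in the `title:content` format the parser expects):

```python
dictionary, poetry_vectors, reversed_dictionary = build_dataset()
print(len(dictionary))                            # vocabulary size, padding space included
print(poetry_vectors[0][:5])                      # the first (shortest) poem as character ids
print(reversed_dictionary[poetry_vectors[0][0]])  # prints '[' (the start marker)
```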

## 2. BatchGenerator.py

```python
# -*-coding:utf-8-*-#
import numpy as np

class BatchGenerator(object):
    """Pads the poem vectors and splits them into fixed-size training batches."""
    def __init__(self, data, batch_size, empty_key):
        self._batch_size = batch_size
        self._offset = 0
        self._batch = []
        self._data_size = len(data)
        self._batch_num = self._data_size // self._batch_size
        self._data = data
        self._generate_batch(empty_key)

    def _generate_batch(self, empty_key):
        for index in range(self._batch_num):
            start = index * self._batch_size
            end = start + self._batch_size
            # the maximum poem length within the current batch
            length = max(map(len, self._data[start: end]))
            # build the batch; poems shorter than the maximum length are padded with the space id
            batch_data = np.full((self._batch_size, length), empty_key, np.int32)
            for row in range(self._batch_size):
                poetry = self._data[start + row]
                batch_data[row, :len(poetry)] = poetry
            self._batch.append(batch_data)

    def next(self):
        x_data = self._batch[self._offset]
        y_data = np.copy(x_data)
        # the target sequence is the input sequence shifted left by one
        y_data[:, :-1] = x_data[:, 1:]
        self._offset = (self._offset + 1) % self._batch_num
        return x_data, y_data
```
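
As a sanity check, the targets returned by `next()` should be the inputs shifted left by one. A sketch (the dataset module name `PoetryDataset` is my assumption, matching the heading above):

```python
from PoetryDataset import build_dataset  # dataset module name assumed
from BatchGenerator import BatchGenerator

dictionary, poetry_vectors, _ = build_dataset()
generator = BatchGenerator(poetry_vectors, batch_size=64, empty_key=dictionary[' '])
x, y = generator.next()
print(x.shape)                          # (64, longest poem length in this batch)
assert (y[:, :-1] == x[:, 1:]).all()    # targets are the inputs shifted by one
```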

## 3. GeneratePoetryModel.py

```python
# -*-coding:utf-8-*-#
import tensorflow as tf

class GeneratePoetryModel(object):
    """Character-level RNN language model for generating poems."""
    def __init__(self, X, batch_size, input_size, output_size, model='lstm', rnn_size=128, num_layers=2):
        self._model = model
        self._num_unit = rnn_size        # number of units in each LSTM cell
        self._num_layers = num_layers    # number of LSTM layers
        self._input_size = input_size    # input dimension of the final fully connected layer
        self._output_size = output_size  # output dimension of the final fully connected layer
        self._model_layers = self._get_layer()  # the stacked LSTM hidden layers

        self._initial_state = self._model_layers.zero_state(batch_size, tf.float32)  # the initial state

        with tf.variable_scope('rnnlm'):
            # Xavier-style uniform initialization (see the note below the code)
            n = (self._num_unit + self._output_size) * 0.5
            scale = (3.0 / n) ** 0.5
            # parameters of the fully connected layer
            softmax_w = tf.get_variable(
                "softmax_w",
                [self._num_unit, self._output_size],
                initializer=tf.random_uniform_initializer(-scale, scale))
            softmax_b = tf.get_variable(
                "softmax_b",
                [self._output_size],
                initializer=tf.random_uniform_initializer(-scale, scale))
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [self._input_size, self._num_unit])
                inputs = tf.nn.embedding_lookup(embedding, X)

        # run the LSTM hidden layers
        outputs, last_state = tf.nn.dynamic_rnn(self._model_layers, inputs, initial_state=self._initial_state, scope="rnnlm")
        self._outputs = tf.reshape(outputs, [-1, self._num_unit])
        self._last_state = last_state
        # result of the fully connected layer
        self._logits = tf.matmul(self._outputs, softmax_w) + softmax_b
        # prediction probabilities
        self._probs = tf.nn.softmax(self._logits)

    def _get_cell(self):
        if self._model == 'rnn':
            return tf.nn.rnn_cell.BasicRNNCell(self._num_unit)
        elif self._model == 'gru':
            return tf.nn.rnn_cell.GRUCell(self._num_unit)
        return tf.nn.rnn_cell.BasicLSTMCell(self._num_unit, state_is_tuple=True)

    def _get_layer(self):
        # create a separate cell instance per layer; reusing one instance breaks in newer TF 1.x
        return tf.nn.rnn_cell.MultiRNNCell(
            [self._get_cell() for _ in range(self._num_layers)], state_is_tuple=True)

    def results(self):
        """
        Return the network outputs and the tensors callers need.
        """
        return self._logits, self._last_state, self._probs, self._initial_state
```
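
The uniform bounds follow Xavier-style initialization: a uniform distribution on $(-a, a)$ has variance $a^2/3$, so with $n$ the average of fan-in and fan-out,

$$
a = \sqrt{\frac{3}{n}}, \qquad n = \frac{n_{\mathrm{in}} + n_{\mathrm{out}}}{2} \;\Longrightarrow\; \operatorname{Var}[w] = \frac{1}{n}.
$$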

## 4. learning_poetry.py

```python
# -*-coding:utf-8-*-#
import datetime
import os

import tensorflow as tf

from BatchGenerator import BatchGenerator
from GeneratePoetryModel import GeneratePoetryModel
from PoetryDataset import build_dataset  # dataset module name assumed

dictionary, poetry_vectors, reversed_dictionary = build_dataset()
empty_key = dictionary.get(' ')

batch_size = 64

batch_generator = BatchGenerator(poetry_vectors, batch_size, empty_key)

input_size = output_size = len(dictionary) + 1

train_data = tf.placeholder(tf.int32, [batch_size, None])
train_label = tf.placeholder(tf.int32, [batch_size, None])

model = GeneratePoetryModel(X=train_data, batch_size=batch_size, input_size=input_size, output_size=output_size)

logits, last_state, _, _ = model.results()
targets = tf.reshape(train_label, [-1])
# per-position cross-entropy with uniform weights
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    [logits], [targets], [tf.ones_like(targets, dtype=tf.float32)])
cost = tf.reduce_mean(loss)

global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.01, global_step, batch_generator._batch_num, 0.9, staircase=True)
# the optimizer definition is missing from the original listing; Adam is assumed here
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())

    saver = tf.train.Saver(tf.global_variables())
    print("training...")
    model_dir = "./model/"
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
        print("create the directory: %s" % model_dir)
    # the epoch with the smallest cost
    best_cost_epoch = 0
    # the smallest cost so far
    best_cost = float('Inf')
    start_time = datetime.datetime.now()
    for epoch in range(141):
        epoch_start_time = datetime.datetime.now()
        epoch_mean_cost = 0
        for batch in range(batch_generator._batch_num):
            x_data, y_data = batch_generator.next()
            _, _, c, lr, gs = session.run(
                [optimizer, last_state, cost, learning_rate, global_step],
                feed_dict={train_data: x_data, train_label: y_data})
            epoch_mean_cost += c
            print("current epoch %d, current batch is %d, mean cost : %2.8f, learning rate: %2.8f, global step : %d"
                  % (epoch, batch, c, lr, gs))
        epoch_mean_cost = epoch_mean_cost / batch_generator._batch_num
        print("=" * 80)
        if epoch != 0:
            print("\nthe best cost : %2.8f, the best epoch index : %d, current epoch cost : %2.8f. \n"
                  % (best_cost, best_cost_epoch, epoch_mean_cost))
        if best_cost > epoch_mean_cost:
            print("the best epoch will change from %d to %d" % (best_cost_epoch, epoch))
            best_cost = epoch_mean_cost
            best_cost_epoch = epoch
            saver.save(session, model_dir + 'poetry.module-best')
        if epoch % 7 == 0:
            saver.save(session, model_dir + 'poetry.module', global_step=epoch)
        end_time = datetime.datetime.now()
        timedelta = end_time - epoch_start_time
        print("the epoch training spends %d days, %d hours, %d minutes, %d seconds.\n"
              % (timedelta.days, timedelta.seconds // 3600, (timedelta.seconds // 60) % 60, timedelta.seconds % 60))
        print("=" * 80)
        print("\n")
    timedelta = end_time - start_time
    print("*" * 80)
    print("\nThe training spends %d days, %d hours, %d minutes, %d seconds"
          % (timedelta.days, timedelta.seconds // 3600, (timedelta.seconds // 60) % 60, timedelta.seconds % 60))
```
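
With `staircase=True`, `tf.train.exponential_decay` drops the learning rate by a factor of 0.9 once per epoch (every `_batch_num` steps):

$$
lr = 0.01 \times 0.9^{\left\lfloor \mathrm{global\_step} / \mathrm{batch\_num} \right\rfloor}
$$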

## 5. create_poetry.py

```python
# -*-coding:utf-8-*-#
import numpy as np
import tensorflow as tf

from GeneratePoetryModel import GeneratePoetryModel
from PoetryDataset import build_dataset  # dataset module name assumed

dictionary, poetry_vectors, reversed_dictionary = build_dataset()

def to_word(weights):
    """
    Interpret the given weights as a probability distribution and pick a
    character by random sampling, similar to the selection step of a
    genetic algorithm. (Personally I think this is not rigorous enough.)
    """
    t = np.cumsum(weights)
    s = np.sum(weights)
    sample = int(np.searchsorted(t, np.random.rand(1) * s))
    return reversed_dictionary[sample]

# the input is a single character; the next character is predicted from the previous one
input_data = tf.placeholder(tf.int32, [1, None])
# input and output dimensions (the vocabulary size)
input_size = output_size = len(reversed_dictionary) + 1
# define the model with batch size 1
model = GeneratePoetryModel(X=input_data, batch_size=1, input_size=input_size, output_size=output_size)
# fetch the model's output tensors
_, last_state, probs, initial_state = model.results()

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    print("generate...")
    saver.restore(session, './model/poetry.module-140')
    # the start-of-poem character is '['
    x = np.array([list(map(dictionary.get, '['))])
    # run the zero initial state
    state_ = session.run(initial_state)
    word = poem = '['
    # the end-of-poem character is ']'
    while word != ']':
        # feed the previous state and output back in as the next input
        probs_, state_ = session.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
        word = to_word(probs_)
        poem += word
        # look up the id of the sampled character
        x = np.zeros((1, 1))
        x[0, 0] = dictionary[word]
    print(poem)
```
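
The docstring of `to_word` already admits that the sampling is a bit crude. One common refinement, sketched here as an alternative rather than as part of the original code, is temperature sampling:

```python
import numpy as np

def to_word_with_temperature(weights, temperature=0.8):
    """Sample a character id from a temperature-adjusted distribution.

    temperature < 1 makes the choice more conservative, > 1 more adventurous.
    """
    probs = np.asarray(weights, dtype=np.float64).ravel()
    logits = np.log(probs + 1e-10) / temperature   # sharpen or flatten the distribution
    probs = np.exp(logits) / np.sum(np.exp(logits))
    return int(np.random.choice(len(probs), p=probs))
```

Changing the loop condition to `while word != ']' and len(poem) < 100:` additionally guarantees that generation terminates even when the model never emits ']' (see the first point in the conclusion).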

# Experimental Results

• [者外临江冷，门谈上水空。远人长忆欲，月色杳冠年。招海幂明匆，星辕藏似金。声凉草不醒，窗静塘称琴。]
• [玉低回上道，鹤发不离群。张屋犹飞绕，兼言潜草边。浮舟仍已尽，会数种如群。王子怜桃李，闲空昔浦书。无同在天韶，何处似戎衣。]
• [水平西望使人苏，此地何曾肯相思。独念千端饮在器，路来闲墨网来难。一行分管潮中去，朝更还似盘抹热。愁闻几醉遗名处，长短逢离火上风。]

# Conclusion

• During generation, the process can get stuck in a very long run because the end marker ']' is never produced (the length cap sketched above is a simple workaround). This suggests the model has not been trained thoroughly enough.
• I personally think the result is still incomplete: the loss stops falling once it reaches a certain level, and I don't know whether it has reached the global minimum.
• The embedding used for training here is randomly initialized; I think initializing it with word2vec would work better.
• Tang poems should arguably meet some requirement on parallelism (对仗), so combining the model with attention might improve the parallel structure.
• These poems are fairly meaningless; if they could be generated from a given topic, the results might be more presentable.