# coding: utf-8
"""Data-loading helpers for the cnews text-classification corpus.

The corpus files are expected to hold one sample per line in the form
``<label>\t<text>``.  Text is handled at character level: every character
(including punctuation) is one token.
"""

import sys
from collections import Counter

import numpy as np

try:
    import tensorflow.contrib.keras as kr
except ImportError:
    # Only process_file() needs Keras.  Deferring the failure keeps the
    # pure-Python / NumPy helpers below importable without TensorFlow 1.x.
    kr = None

# sys.version_info looks like sys.version_info(major=3, minor=6, micro=5)
# on Python 3.6.5; only the major version matters here.
if sys.version_info[0] > 2:
    is_py3 = True
else:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False


def native_word(word, encoding='utf-8'):
    """Return ``word`` in the interpreter's native string type.

    Intended for running a model trained under Python 3 from Python 2:
    on Python 2 the word is encoded with ``encoding``; on Python 3 it is
    returned unchanged.
    """
    if not is_py3:
        return word.encode(encoding)
    return word


def native_content(content):
    """Return ``content`` as text.

    On Python 2 the value is decoded from UTF-8 to ``unicode``; on
    Python 3 it is returned unchanged.
    """
    if not is_py3:
        return content.decode('utf-8')
    return content


def open_file(filename, mode='r'):
    """Open ``filename`` in a way that works on both Python 2 and 3.

    On Python 3 the file is opened as UTF-8 and undecodable bytes are
    ignored; on Python 2 the plain built-in ``open`` is used.
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    return open(filename, mode)


def read_file(filename):
    """Read a ``<label>\\t<text>`` corpus file.

    Returns:
        (contents, labels): ``contents[i]`` is the list of characters of
        the i-th sample's text and ``labels[i]`` is its label string.

    Lines that do not consist of exactly two tab-separated fields, or
    that cannot be decoded, are skipped (best effort).
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if not content:
                    continue
                # Convert both fields before appending either one, so the
                # two lists can never get out of step.
                chars = list(native_content(content))
                label = native_content(label)
            except ValueError:
                # Wrong number of fields, or (Python 2) a UnicodeDecodeError,
                # which is a ValueError subclass: skip the malformed line.
                continue
            contents.append(chars)
            labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training file and store it.

    The ``vocab_size - 1`` most frequent characters of ``train_dir`` are
    written to ``vocab_dir``, one per line, after a leading ``<PAD>``
    entry.  ``<PAD>`` therefore gets id 0, the value Keras'
    ``pad_sequences`` pads with by default.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    count_pairs = counter.most_common(vocab_size - 1)
    # A comprehension (rather than zip(*count_pairs)) also works when the
    # training file produced no samples at all.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Read the vocabulary file written by ``build_vocab``.

    Returns:
        (words, word_to_id): the list of entries in file order (each line
        stripped of surrounding whitespace) and a dict mapping each entry
        to its index in that list.
    """
    with open_file(vocab_dir) as fp:
        words = [native_content(line.strip()) for line in fp]
    word_to_id = {word: index for index, word in enumerate(words)}
    return words, word_to_id


def read_category():
    """Return the fixed category list and a category -> id mapping.

    Ids are the positions in the list, e.g. the first category maps to 0.
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [native_content(x) for x in categories]
    cat_to_id = {category: index for index, category in enumerate(categories)}
    return categories, cat_to_id


def to_words(content, words):
    """Turn a sequence of ids back into text using the ``words`` list."""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a corpus file into padded id sequences and one-hot labels.

    Each text becomes the list of ids of its characters; characters that
    are not in ``word_to_id`` are dropped.  Because texts differ in
    length, the id lists are then brought to ``max_length`` with Keras'
    ``pad_sequences``, and the labels are one-hot encoded with
    ``to_categorical`` using ``len(cat_to_id)`` classes.

    Returns:
        (x_pad, y_pad): arrays of shape ``(n_samples, max_length)`` and
        ``(n_samples, len(cat_to_id))``.

    Raises:
        ImportError: if ``tensorflow.contrib.keras`` is not available.
        KeyError: if a label in the file is missing from ``cat_to_id``.
    """
    if kr is None:
        raise ImportError(
            "process_file requires tensorflow.contrib.keras (TensorFlow 1.x)")

    contents, labels = read_file(filename)

    data_id = [[word_to_id[ch] for ch in content if ch in word_to_id]
               for content in contents]
    label_id = [cat_to_id[label] for label in labels]

    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` mini-batches.

    ``x`` and ``y`` are shuffled together with one random permutation, so
    they must support NumPy integer-array indexing.  Batches are yielded
    lazily; the last one may hold fewer than ``batch_size`` samples, and
    nothing is yielded for empty input.
    """
    data_len = len(x)

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]




#!/usr/bin/python
# -*- coding: utf-8 -*-

import tensorflow as tf


class TRNNConfig(object):
    """Hyper-parameters of the RNN text classifier."""

    embedding_dim = 64
    seq_length = 600
    num_classes = 10
    vocab_size = 5000

    num_layers = 2          # number of stacked recurrent layers
    hidden_dim = 128
    rnn = 'gru'             # cell type: 'lstm' or 'gru'

    dropout_keep_prob = 0.8
    learning_rate = 1e-3

    batch_size = 128
    num_epochs = 10

    print_per_batch = 100
    save_per_batch = 10


class TextRNN(object):
    """Text classifier built from stacked recurrent layers."""

    def __init__(self, config):
        """Create the input placeholders and build the graph from ``config``."""
        self.config = config

        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.rnn()

    def rnn(self):
        """Build the model graph.

        Sets ``self.logits``, ``self.y_pred_cls``, ``self.loss``,
        ``self.optim`` and ``self.acc``.
        """
        cfg = self.config

        def make_cell():
            # One recurrent cell of the configured type, with dropout applied
            # to its output.
            if cfg.rnn == 'lstm':
                base_cell = tf.nn.rnn_cell.LSTMCell(cfg.hidden_dim, state_is_tuple=True)
            else:
                base_cell = tf.nn.rnn_cell.GRUCell(cfg.hidden_dim)
            return tf.nn.rnn_cell.DropoutWrapper(base_cell, output_keep_prob=self.keep_prob)

        # The embedding table and lookup are pinned to '/gpu:0'.
        with tf.device('/gpu:0'):
            embedding = tf.get_variable('embedding', [cfg.vocab_size, cfg.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("rnn"):
            # Stack num_layers dropout-wrapped cells into one multi-layer cell.
            cells = []
            for _ in range(cfg.num_layers):
                cells.append(make_cell())
            rnn_cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)

            _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
            # Use the output at the last time step (top layer) as the
            # representation of the whole sequence.
            last = _outputs[:, -1, :]

        with tf.name_scope("score"):
            # Fully connected layer followed by dropout and ReLU, then the
            # classification layer.
            fc = tf.layers.dense(last, cfg.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            self.logits = tf.layers.dense(fc, cfg.num_classes, name='fc2')
            probabilities = tf.nn.softmax(self.logits)
            self.y_pred_cls = tf.argmax(probabilities, 1)

        with tf.name_scope("optimize"):
            # Mean softmax cross-entropy, minimised with Adam.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate)
            self.optim = optimizer.minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Fraction of samples whose predicted class equals the labelled one.
            true_cls = tf.argmax(self.input_y, 1)
            correct_pred = tf.equal(true_cls, self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))




# coding: utf-8from __future__ import print_functionimport osimport sysimport timefrom datetime import timedeltaimport numpy as npimport tensorflow as tffrom sklearn import metricsfrom rnn_model import TRNNConfig, TextRNNfrom cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocabbase_dir = 'data/cnews'train_dir = os.path.join(base_dir, 'cnews.train.txt')test_dir = os.path.join(base_dir, 'cnews.test.txt')val_dir = os.path.join(base_dir, 'cnews.val.txt')vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')save_dir = 'checkpoints/textrnn'save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径def get_time_dif(start_time):    """获取已使用时间"""    end_time = time.time()    time_dif = end_time - start_time    return timedelta(seconds=int(round(time_dif)))def feed_data(x_batch, y_batch, keep_prob):    feed_dict = {        model.input_x: x_batch,        model.input_y: y_batch,        model.keep_prob: keep_prob    }    return feed_dictdef evaluate(sess, x_, y_):    """评估在某一数据上的准确率和损失"""    data_len = len(x_)    batch_eval = batch_iter(x_, y_, 128)    total_loss = 0.0    total_acc = 0.0    for x_batch, y_batch in batch_eval:        batch_len = len(x_batch)        feed_dict = feed_data(x_batch, y_batch, 1.0)        # 在测试时不用进行dropout        y_pred_class,loss, acc = sess.run([model.y_pred_cls,model.loss, model.acc], feed_dict=feed_dict)        total_loss += loss * batch_len        total_acc += acc * batch_len    return y_pred_class,total_loss / data_len, total_acc / data_lendef train():    print("Configuring TensorBoard and Saver...")    tensorboard_dir = 'tensorboard/textrnn'    if not os.path.exists(tensorboard_dir):        os.makedirs(tensorboard_dir)    tf.summary.scalar("loss", model.loss)    tf.summary.scalar("accuracy", model.acc)    merged_summary = tf.summary.merge_all()    writer = tf.summary.FileWriter(tensorboard_dir)    # 配置 Saver    saver = tf.train.Saver()    if not os.path.exists(save_dir):        os.makedirs(save_dir)    
print("Loading training and validation data...")    # 载入训练集与验证集    start_time = time.time()    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)    time_dif = get_time_dif(start_time)    print("Time usage:", time_dif)    # 创建session    session = tf.Session()    session.run(tf.global_variables_initializer())    writer.add_graph(session.graph)    print('Training and evaluating...')    start_time = time.time()    total_batch = 0      best_acc_val = 0.0      last_improved = 0      require_improvement = 1000      # 如果超过1000轮未提升,提前结束训练    flag = False    for epoch in range(config.num_epochs):        print('Epoch:', epoch + 1)        batch_train = batch_iter(x_train, y_train, config.batch_size)        for x_batch, y_batch in batch_train:            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)            if total_batch % config.save_per_batch == 0:                s = session.run(merged_summary, feed_dict=feed_dict)                writer.add_summary(s, total_batch)            if total_batch % config.print_per_batch == 0:                                feed_dict[model.keep_prob] = 1.0                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)                y_pred_cls_1,loss_val, acc_val = evaluate(session, x_val, y_val)  # todo                if acc_val > best_acc_val:                    # 保存最好结果                    best_acc_val = acc_val                    last_improved = total_batch                    saver.save(sess=session, save_path=save_path)                    improved_str = '*'                else:                    improved_str = ''                time_dif = get_time_dif(start_time)                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'                print(msg.format(total_batch, 
loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))            session.run(model.optim, feed_dict=feed_dict)  # 运行优化            total_batch += 1            if total_batch - last_improved > require_improvement:                # 验证集正确率长期不提升,提前结束训练                print("No optimization for a long time, auto-stopping...")                flag = True                break          if flag:              breakdef test():    print("Loading test data...")    start_time = time.time()    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)    session = tf.Session()    session.run(tf.global_variables_initializer())    saver = tf.train.Saver()    saver.restore(sess=session, save_path=save_path)     # 读取保存的模型    print('Testing...')    y_pred,loss_test, acc_test = evaluate(session, x_test, y_test)    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'    print(msg.format(loss_test, acc_test))    batch_size = 128    data_len = len(x_test)    num_batch = int((data_len - 1) / batch_size) + 1    y_test_cls = np.argmax(y_test, 1)    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)      for i in range(num_batch):          start_id = i * batch_size        end_id = min((i + 1) * batch_size, data_len)        feed_dict = {            model.input_x: x_test[start_id:end_id],            model.keep_prob: 1.0        }        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)    # 评估    print("Precision, Recall and F1-Score...")    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))    # 混淆矩阵    print("Confusion Matrix...")    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)    print(cm)    time_dif = get_time_dif(start_time)    print("Time usage:", time_dif)if __name__ == '__main__':    print('Configuring RNN model...')    config = TRNNConfig()        if not os.path.exists(vocab_dir):          build_vocab(train_dir, vocab_dir, config.vocab_size)    categories, cat_to_id = 
read_category()    words, word_to_id = read_vocab(vocab_dir)    config.vocab_size = len(words)    model = TextRNN(config)    option='train'        if option == 'train':        train()    else:        test()



Iter:   3500, Train Loss:  0.034, Train Acc:  98.44%, Val Loss:   0.35, Val Acc:  91.54%, Time: 0:46:47 * Testing... Test Loss:    0.2, Test Acc:  94.67% Precision, Recall and F1-Score...               precision    recall  f1-score   support           体育       0.99      0.99      0.99      1000           财经       0.93      0.99      0.96      1000           房产       1.00      1.00      1.00      1000           家居       0.95      0.83      0.89      1000           教育       0.88      0.93      0.90      1000           科技       0.95      0.96      0.95      1000           时尚       0.95      0.95      0.95      1000           时政       0.95      0.91      0.93      1000           游戏       0.94      0.96      0.95      1000           娱乐       0.94      0.96      0.95      1000    micro avg       0.95      0.95      0.95     10000    macro avg       0.95      0.95      0.95     10000 weighted avg       0.95      0.95      0.95     10000 Confusion Matrix... [[990   0   0   0   5   1   0   0   4   0]  [  0 987   1   0   2   3   0   6   1   0]  [  0   0 996   2   2   0   0   0   0   0]  [  0  22   2 834  60  20  25  20  10   7]  [  1   6   0   6 925   7   5  12   4  34]  [  0   5   0   8   8 959   2   2  16   0]  [  0   0   0  13   9   2 948   4  12  12]  [  0  33   1  15  21  11   1 910   4   4]  [  1   1   0   2  10   5  11   0 962   8]  [  4   2   0   1  15   3   5   2  12 956]] Time usage: 0:00:40




# coding: utf-8from __future__ import print_functionimport osimport tensorflow as tfimport tensorflow.contrib.keras as krfrom rnn_model import TRNNConfig, TextRNNfrom cnews_loader import read_category, read_vocabtry:    bool(type(unicode))except NameError:    unicode = strbase_dir = 'data/cnews'vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')save_dir = 'checkpoints/textrnn'save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径class RnnModel:    def __init__(self):        self.config = TRNNConfig()        self.categories, self.cat_to_id = read_category()        self.words, self.word_to_id = read_vocab(vocab_dir)        self.config.vocab_size = len(self.words)        self.model = TextRNN(self.config)        self.session = tf.Session()        self.session.run(tf.global_variables_initializer())        saver = tf.train.Saver()        saver.restore(sess=self.session, save_path=save_path)          # 读取保存的模型    def predict(self, message):        content = unicode(message)        data = [self.word_to_id[x] for x in content if x in self.word_to_id]        feed_dict = {            self.model.input_x: kr.preprocessing.sequence.pad_sequences([data], self.config.seq_length),            self.model.keep_prob: 1.0        }        y_pred_cls = self.session.run(self.model.y_pred_cls, feed_dict=feed_dict)        return self.categories[y_pred_cls[0]]if __name__ == '__main__':    rnn_model = RnnModel()    test_demo = ['三星ST550以全新的拍摄方式超越了以往任何一款数码相机',                 '热火vs骑士前瞻:皇帝回乡二番战 东部次席唾手可得新浪体育讯北京时间3月30日7:00']    for i in test_demo:        print(rnn_model.predict(i))



