python使用RNN实现文本分类


Posted in Python on May 24, 2018

本文实例为大家分享了使用RNN进行文本分类,python代码实现,供大家参考,具体内容如下

1、本博客项目由来是 Oxford 的 NLP 深度学习课程第三周作业,作业要求使用 LSTM 进行文本分类。和上一篇 CNN 文本分类类似,本次代码风格也是仿照 sklearn 风格,采用三步走形式(模型实体化、模型训练和模型预测)。但因为训练时间较久,不知道训练到什么时候比较理想,因此在此基础上加入了继续训练的功能。

2、构造文本分类的rnn类,(保存文件为ClassifierRNN.py)

2.1 相应配置参数因为较为繁琐,不利于阅读,因此仿照tensorflow源码形式,将代码分成 网络配置参数 nn_config 和计算配置参数: calc_config,也相应声明了其对应的类:NN_config,CALC_config。

2.2 声明 ClassifierRNN类,该类的主要函数有:(init, build_inputs, build_rnns, build_loss, build_optimizer, random_batches,fit, load_model, predict_accuracy, predict),代码如下:

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import time
class NN_config(object):
    """Container for the network-structure settings of the classifier.

    Every constructor argument is stored unchanged as an attribute of the
    same name.
    """

    def __init__(
        self,
        num_seqs=1000,
        num_steps=10,
        num_units=128,
        num_classes=8,
        num_layers=1,
        embedding_size=100,
        vocab_size=10000,
        use_embeddings=False,
        embedding_init=None,
    ):
        # sequence geometry
        self.num_seqs, self.num_steps = num_seqs, num_steps
        # recurrent stack and output size
        self.num_units, self.num_classes = num_units, num_classes
        self.num_layers = num_layers
        # embedding table
        self.vocab_size, self.embedding_size = vocab_size, embedding_size
        self.use_embeddings = use_embeddings
        self.embedding_init = embedding_init

class CALC_config(object):
    """Container for the training-run settings of the classifier.

    Every constructor argument is stored unchanged as an attribute of the
    same name.
    """

    def __init__(
        self,
        batch_size=64,
        num_epoches=20,
        learning_rate=1.0e-3,
        keep_prob=0.5,
        show_every_steps=10,
        save_every_steps=100,
    ):
        # optimisation settings
        self.batch_size, self.num_epoches = batch_size, num_epoches
        self.learning_rate, self.keep_prob = learning_rate, keep_prob
        # reporting / checkpoint cadence, counted in training steps
        self.show_every_steps = show_every_steps
        self.save_every_steps = save_every_steps

class ClassifierRNN(object):
    """LSTM text classifier (TensorFlow 1.x) with an sklearn-like workflow.

    Usage is three steps: construct the model (builds the graph), ``fit`` it,
    then call ``predict_accuracy`` / ``predict``.  Checkpoints are written to
    and restored from a ``models`` directory under the current working
    directory.

    Only full batches of ``batch_size`` samples are ever fed to the graph
    (see ``random_batches``); trailing samples that do not fill a batch are
    skipped by ``fit``, ``predict_accuracy`` and ``predict`` alike.
    """

    def __init__(self, nn_config, calc_config):
        """Copy the settings from both config objects and build the graph.

        Args:
            nn_config: object providing the network attributes read below
                (e.g. an ``NN_config``).
            calc_config: object providing the training attributes read below
                (e.g. a ``CALC_config``).
        """
        # network settings
        self.num_seqs = nn_config.num_seqs
        self.num_steps = nn_config.num_steps
        self.num_units = nn_config.num_units
        self.num_layers = nn_config.num_layers
        self.num_classes = nn_config.num_classes
        self.embedding_size = nn_config.embedding_size
        self.vocab_size = nn_config.vocab_size
        self.use_embeddings = nn_config.use_embeddings
        self.embedding_init = nn_config.embedding_init
        # training settings
        self.batch_size = calc_config.batch_size
        self.num_epoches = calc_config.num_epoches
        self.learning_rate = calc_config.learning_rate
        self.train_keep_prob = calc_config.keep_prob
        self.show_every_steps = calc_config.show_every_steps
        self.save_every_steps = calc_config.save_every_steps
        # build the graph from scratch in the (reset) default graph
        tf.reset_default_graph()
        self.build_inputs()
        self.build_rnns()
        self.build_loss()
        self.build_optimizer()
        self.saver = tf.train.Saver()

    def build_inputs(self):
        """Create the placeholders and the embedding lookup.

        ``inputs`` holds word indices, ``[None, num_seqs]``; ``targets`` holds
        one row of ``num_classes`` values per sample.  With ``use_embeddings``
        off, a trainable embedding table is initialised uniformly in
        [-0.1, 0.1]; otherwise a non-trainable table is overwritten from
        ``embedding_ph`` each time the lookup is evaluated.
        """
        with tf.name_scope('inputs'):
            self.inputs = tf.placeholder(
                tf.int32, shape=[None, self.num_seqs], name='inputs')
            self.targets = tf.placeholder(
                tf.int32, shape=[None, self.num_classes], name='classes')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            self.embedding_ph = tf.placeholder(tf.float32, name='embedding_ph')

            if not self.use_embeddings:
                self.embeddings = tf.Variable(
                    tf.random_uniform(
                        [self.vocab_size, self.embedding_size], -0.1, 0.1),
                    name='embedding_flase')
                self.rnn_inputs = tf.nn.embedding_lookup(
                    self.embeddings, self.inputs)
            else:
                embeddings = tf.Variable(
                    tf.constant(
                        0.0, shape=[self.vocab_size, self.embedding_size]),
                    trainable=False, name='embeddings_true')
                # the assign op is what gets looked up, so embedding_ph must
                # be fed on every run (see _make_feed)
                self.embeddings = embeddings.assign(self.embedding_ph)
                self.rnn_inputs = tf.nn.embedding_lookup(
                    self.embeddings, self.inputs)
                print('self.rnn_inputs.shape:', self.rnn_inputs.shape)

    def build_rnns(self):
        """Stack ``num_layers`` dropout-wrapped LSTM cells and unroll them."""
        def get_a_cell(num_units, keep_prob):
            rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units)
            return tf.contrib.rnn.DropoutWrapper(
                rnn_cell, output_keep_prob=keep_prob)

        with tf.name_scope('rnns'):
            self.cell = tf.contrib.rnn.MultiRNNCell(
                [get_a_cell(self.num_units, self.keep_prob)
                 for _ in range(self.num_layers)])
            # the zero state is created for exactly batch_size rows
            self.initial_state = self.cell.zero_state(
                self.batch_size, tf.float32)
            self.outputs, self.final_state = tf.nn.dynamic_rnn(
                self.cell, tf.cast(self.rnn_inputs, tf.float32),
                initial_state=self.initial_state)
            print('rnn_outputs', self.outputs.shape)

    def build_loss(self):
        """Mean-pool the RNN outputs over time and attach loss and accuracy.

        ``predictions`` are the raw logits; the predicted class is their
        argmax along axis 1.
        """
        with tf.name_scope('loss'):
            self.logits = tf.contrib.layers.fully_connected(
                inputs=tf.reduce_mean(self.outputs, axis=1),
                num_outputs=self.num_classes, activation_fn=None)
            print('self.logits.shape:', self.logits.shape)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.targets))
            print('self.cost.shape', self.cost.shape)
            self.predictions = self.logits
            self.correct_predictions = tf.equal(
                tf.argmax(self.predictions, axis=1),
                tf.argmax(self.targets, axis=1))
            self.accuracy = tf.reduce_mean(
                tf.cast(self.correct_predictions, tf.float32))
            print(self.cost.shape)
            print(self.correct_predictions.shape)

    def build_optimizer(self):
        """Create the Adam training op that minimises ``cost``."""
        with tf.name_scope('optimizer'):
            self.optimizer = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.cost)

    def random_batches(self, data, shuffle=True, num_epoches=None):
        """Yield full batches of ``batch_size`` rows from ``data``.

        Args:
            data: sequence of samples; converted with ``np.array``.
            shuffle: reshuffle the samples at the start of every epoch.
            num_epoches: number of passes over ``data``; defaults to
                ``self.num_epoches``.  Lets the prediction methods ask for a
                single pass without mutating the model.

        Yields:
            Slices of ``batch_size`` rows.  The trailing rows of each epoch
            that do not fill a whole batch are not yielded.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = data_size // self.batch_size
        if num_epoches is None:
            num_epoches = self.num_epoches

        for _ in range(num_epoches):
            if shuffle:
                shuffle_index = np.random.permutation(np.arange(data_size))
                epoch_data = data[shuffle_index]
            else:
                epoch_data = data
            for batch_num in range(num_batches_per_epoch):
                start = batch_num * self.batch_size
                yield epoch_data[start:start + self.batch_size]

    def _make_feed(self, inputs, targets=None, keep_prob=1.0):
        """Build the feed dict shared by training and inference."""
        feed = {self.inputs: inputs, self.keep_prob: keep_prob}
        if targets is not None:
            feed[self.targets] = targets
        if self.use_embeddings:
            feed[self.embedding_ph] = self.embedding_init
        return feed

    def fit(self, data, restart=False):
        """Train on ``data`` and checkpoint into ``./models``.

        Args:
            data: sequence of ``(word_indices, one_hot_label)`` pairs.
            restart: when true, continue from the latest checkpoint instead
                of initialising the variables.

        The session is closed when training finishes; the prediction methods
        restore the latest checkpoint into a new session.
        """
        if restart:
            self.load_model()
        else:
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
        with self.session as sess:
            step = 0
            accuracy_list = []
            # model saving
            save_path = os.path.abspath(os.path.join(os.path.curdir, 'models'))
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            checkpoint_prefix = os.path.join(save_path, 'model')
            plt.ion()
            for batch in self.random_batches(data):
                x, y = zip(*batch)
                x = np.array(x)
                y = np.array(y)
                print(len(x), len(y), step)
                step += 1
                start = time.time()
                # Every batch starts from the graph's zero state.  Batches are
                # shuffled, independent documents, so carrying the previous
                # batch's final LSTM state over would leak unrelated context
                # and would not match inference, which uses the zero state.
                feed = self._make_feed(x, y, keep_prob=self.train_keep_prob)
                batch_loss, batch_accuracy, _ = sess.run(
                    [self.cost, self.accuracy, self.optimizer],
                    feed_dict=feed)
                end = time.time()
                accuracy_list.append(batch_accuracy)
                # control the print lines
                if step % self.show_every_steps == 0:
                    print('steps/epoch:{}/{}...'.format(step, self.num_epoches),
                          'loss:{:.4f}...'.format(batch_loss),
                          '{:.4f} sec/batch'.format((end - start)),
                          'batch_Accuracy:{:.4f}...'.format(batch_accuracy)
                          )
                    plt.plot(accuracy_list)
                    plt.pause(0.5)
                if step % self.save_every_steps == 0:
                    self.saver.save(sess, checkpoint_prefix, global_step=step)
            self.saver.save(sess, checkpoint_prefix, global_step=step)

    def load_model(self, start_path=None):
        """Restore variables into a fresh session stored on ``self.session``.

        Args:
            start_path: checkpoint path to restore; when ``None`` the latest
                checkpoint recorded in ``./models`` is used.

        Raises:
            FileNotFoundError: ``start_path`` is ``None`` and ``./models``
                holds no checkpoint.
        """
        if start_path is None:
            model_path = os.path.abspath(os.path.join(os.path.curdir, "models"))
            ckpt = tf.train.get_checkpoint_state(model_path)
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise FileNotFoundError(
                    "no checkpoint found in {}".format(model_path))
            start_path = ckpt.model_checkpoint_path
            print("this is the start path of model:", start_path)
        # release the session of an earlier fit/predict call, if any
        previous = getattr(self, 'session', None)
        if previous is not None:
            previous.close()
        self.session = tf.Session()
        self.saver.restore(self.session, start_path)
        print("Restored model parameters is complete!")

    def predict_accuracy(self, data, test=True):
        """Evaluate the latest checkpoint on labelled ``data``.

        Args:
            data: sequence of ``(word_indices, one_hot_label)`` pairs,
                processed in order, one pass, full batches only.
            test: when true also return the per-sample logits.

        Returns:
            ``(predictions, accuracy)`` when ``test`` is true, otherwise just
            ``accuracy`` (the mean of the per-batch accuracies).
        """
        # loading_model
        self.load_model()
        sess = self.session
        accuracy_list = []
        predictions = []
        batches = self.random_batches(data, shuffle=False, num_epoches=1)
        for iterations, batch in enumerate(batches, start=1):
            x_inputs, y_inputs = zip(*batch)
            feed = self._make_feed(np.array(x_inputs), np.array(y_inputs))
            batch_pred, batch_accuracy = sess.run(
                [self.predictions, self.accuracy], feed_dict=feed)
            accuracy_list.append(np.mean(batch_accuracy))
            predictions.extend(batch_pred)
            print('The trainning step is {0}'.format(iterations),
                  'trainning_accuracy: {:.3f}'.format(accuracy_list[-1]))

        accuracy = np.mean(accuracy_list)
        predictions = np.array(predictions)
        if test:
            return predictions, accuracy
        return accuracy

    def predict(self, data):
        """Return the logits of the latest checkpoint for unlabelled ``data``.

        Args:
            data: sequence of word-index rows, each of length ``num_seqs``.

        Returns:
            Array with one row of ``num_classes`` logits per processed
            sample, in input order.  Samples past the last full batch are not
            processed.
        """
        # load_model
        self.load_model()
        sess = self.session
        predictions = []
        # no shuffling and a single pass, so that row i of the result
        # corresponds to sample i of the input
        for batch in self.random_batches(data, shuffle=False, num_epoches=1):
            batch_pred = sess.run(
                self.predictions, feed_dict=self._make_feed(batch))
            predictions.extend(batch_pred)
        return np.array(predictions)

3、 进行模型数据的导入以及处理和模型训练,集中在一个处理文件中(sampling_trainning.py)
相应代码如下:

ps:下面的代码会用到 GloVe 的词向量文件,可在网上搜索并下载。下载后需要将 GloVe 的格式转换成 word2vec 对应的格式,即在文件头部加入一行,包含两个整数(字典的词数和嵌入向量的维度);也可以使用 Python 库(如 gensim)自带的转换工具,具体使用方法可在网上搜索。

import numpy as np
import os
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import urllib.request
import zipfile
import lxml.etree
from collections import Counter
from random import shuffle
from gensim.models import KeyedVectors

# Download the dataset if it's not already there
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

# Extract both the texts and the labels from the xml file; the archive member
# is opened in its own `with` so the handle is closed after parsing.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)
texts = doc.xpath('//content/text()')
labels = doc.xpath('//head/keywords/text()')
del doc

print("There are {} input texts, each a long string with text and punctuation.".format(len(texts)))
print("")
print(texts[0][:100])

# Remove parenthesised spans and colons from the raw texts.
inputs_text = [re.sub(r'\([^)]*\)', ' ', text) for text in texts]
inputs_text = [re.sub(r':', ' ', text) for text in inputs_text]
print(inputs_text[0][0:100])

# Lower-case the CLEANED texts.  Iterating over `texts` here would silently
# discard the two substitutions above.
inputs_text = [text.lower() for text in inputs_text]
# Turn every remaining non-alphanumeric character into its own `<x_token>` word.
inputs_text = [re.sub(r'([^a-z0-9\s])', r' <\1_token> ', text) for text in inputs_text]
inputs_text = [text.split() for text in inputs_text]
print(inputs_text[0][0:100])

# Label processing: map each talk's keyword string to one of eight classes
# according to which of technology / entertainment / design it mentions.
label_lookup = ['ooo','Too','oEo','ooD','TEo','ToD','oED','TED']
new_label = []
for keywords in labels:
    tags = keywords.split(', ')
    code = ''.join([
        'T' if 'technology' in tags else 'o',
        'E' if 'entertainment' in tags else 'o',
        'D' if 'design' in tags else 'o',
    ])
    new_label.append(label_lookup.index(code))

print('the length of labels:{0}'.format(len(new_label)))
print(new_label[0:50])

# One-hot encode: row r gets a 1.0 in column new_label[r].
num_labels = len(new_label)
labels_index = np.zeros((num_labels, 8))
labels_index[range(num_labels), new_label] = 1.0
print(labels_index[0:10])

# Feature selection: keep only the talks with more than 300 tokens,
# together with their one-hot labels.
unions = [pair for pair in zip(inputs_text, labels_index) if len(pair[0]) > 300]
print(len(unions))
kept_texts, labels_index = zip(*unions)
inputs_text = list(kept_texts)
labels = list(labels_index)
print(inputs_text[0][0:50])
print(labels_index[0:10])

# Feature filtering: drop the 50 most frequent words and every word that
# occurs exactly once in the whole corpus.

all_context = [word for text in inputs_text for word in text]
print('the present datas word is :{0}'.format(len(all_context)))
words_count = Counter(all_context)
most_words = [pair[0] for pair in words_count.most_common(50)]
once_words = [word for word in words_count if words_count[word] == 1]
print('there {0} words only once to be removed'.format(len(once_words)))
print(most_words)
remove_words = set(most_words) | set(once_words)

inputs_new = []
for text in inputs_text:
    inputs_new.append([word for word in text if word not in remove_words])
new_all_counts = [word for kept in inputs_new for word in kept]
print('there new all context length is:{0}'.format(len(new_all_counts)))

# word2index and index2word processings
#
# Two properties matter for everything downstream:
#  * indices must be identical from run to run, otherwise a checkpoint
#    trained in one process cannot be reused in another.  Iterating a set of
#    strings gives a process-dependent order, so the vocabulary is sorted.
#  * index 0 is used as the padding value when sequences are brought to a
#    fixed length, so it is reserved for a dedicated token instead of being
#    shared with a real word.
PAD_TOKEN = '<pad>'
words_voca = [PAD_TOKEN] + sorted(
    {word for text in inputs_new for word in text} - {PAD_TOKEN})
word2index = {word: i for i, word in enumerate(words_voca)}
index2word = {i: word for i, word in enumerate(words_voca)}
inputs_index = [[word2index[word] for word in text] for text in inputs_new]
print(len(inputs_index))
print(inputs_index[0][0:100])

# Load the GloVe vectors (converted to word2vec text format beforehand).
model_glove = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)

# Start from small random vectors and overwrite the rows of every vocabulary
# word that GloVe knows.
n_features = 300
embeddings = np.random.uniform(-0.1,0.1,(len(word2index),n_features))
inwords = 0
for word in words_voca:
    # `word in model_glove` is supported by KeyedVectors in both gensim 3.x
    # and 4.x; the `.vocab` attribute was removed in gensim 4.
    if word in model_glove:
        inwords += 1
        embeddings[word2index[word]] = model_glove[word]
print('there {} words in model_glove'.format(inwords))
print('The voca_word in presents text is:{0}'.format(len(words_voca)))
# builtin float instead of the `np.float` alias, which NumPy >= 1.24 no longer has
print('the precentage of words in glove is:{0}'.format(float(inwords)/len(words_voca)))

# Bring every index sequence to exactly max_length entries: longer ones are
# cut, shorter ones are right-padded with 0.
max_length = 1000
inputs_concat = [
    # a negative repeat count yields an empty list, so long texts get no padding
    text[:max_length] + [0] * (max_length - len(text))
    for text in inputs_index
]
print(len(inputs_concat))
inputs_index = inputs_concat
print(len(inputs_index))

# Stratified sampling: split every class 80% / 10% / 10% into train / dev /
# test after shuffling it, then shuffle each resulting split.
num_class = 8
label_unions = list(zip(inputs_index,labels_index))
print(len(label_unions))
trains, devs, tests = [], [], []
for c in range(num_class):
    type_sample = [pair for pair in label_unions if np.argmax(pair[1]) == c]
    print('the length of this type length',len(type_sample),c)
    shuffle(type_sample)
    num_all = len(type_sample)
    num_train, num_dev = int(num_all*0.8), int(num_all*0.9)
    trains += type_sample[:num_train]
    devs += type_sample[num_train:num_dev]
    tests += type_sample[num_dev:]
for split in (trains, devs, tests):
    shuffle(split)
print('the length of trains is:{0}'.format(len(trains)))
print('the length of devs is:{0}'.format(len(devs)))
print('the length of tests is:{0}'.format(len(tests)))


#--------------------------------------------------------------------
#------------------------ model processing --------------------------
#--------------------------------------------------------------------
from ClassifierRNN import NN_config,CALC_config,ClassifierRNN

# network hyper-parameters
num_layers = 1
num_units = 60
num_seqs = 1000
step_length = 10
num_steps = int(num_seqs/step_length)
embedding_size = 300
num_classes = 8
n_words = len(words_voca)

# training hyper-parameters
batch_size = 64
num_epoch = 100
learning_rate = 0.0075
show_every_epoch = 10


nn_config = NN_config(
    num_seqs=num_seqs,
    num_steps=num_steps,
    num_units=num_units,
    num_classes=num_classes,
    num_layers=num_layers,
    vocab_size=n_words,
    embedding_size=embedding_size,
    use_embeddings=False,
    embedding_init=embeddings,
)
calc_config = CALC_config(
    batch_size=batch_size,
    num_epoches=num_epoch,
    learning_rate=learning_rate,
    show_every_steps=10,
    save_every_steps=100,
)

# Echo both configurations, one "out of <name>..." entry per field.
nn_report = ["this is checking of nn_config:\\\n"]
for field in ('num_seqs', 'num_steps', 'num_units', 'num_classes',
              'num_layers', 'vocab_size', 'embedding_size', 'use_embeddings'):
    nn_report.append("out of {}:{}\n".format(field, getattr(nn_config, field)))
print(*nn_report)

calc_report = ["this is checing of calc_config: \\\n"]
for field in ('batch_size', 'num_epoches', 'learning_rate', 'keep_prob',
              'show_every_steps', 'save_every_steps'):
    calc_report.append("out of {} {} \n".format(field, getattr(calc_config, field)))
print(*calc_report)

# Train from scratch, then score the dev and test splits.
rnn_model = ClassifierRNN(nn_config, calc_config)
rnn_model.fit(trains, restart=False)
accuracy = rnn_model.predict_accuracy(devs, test=False)
print("Final accuracy of devs is {}".format(accuracy))
test_accuracy = rnn_model.predict_accuracy(tests, test=False)
print("The final accuracy of tests is :{}".format(test_accuracy))

4、模型评估:本次算例中数据较少,总共只有2000多个样本,因此难免出现过拟合。RNN 在训练集 trains 上的准确率接近1.0,但在 devs 和 tests 集合上验证时,准确率只有0.6左右。可适当增加 L2 正则化来缓解,但这不在本算例的考虑范围内。将本模型用于 IMDB 算例时,在25000个样本的情况下准确率为89.0%左右。

python使用RNN实现文本分类

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python实现linux服务器批量修改密码并生成execl
Apr 22 Python
详解Golang 与python中的字符串反转
Jul 21 Python
Python基于time模块求程序运行时间的方法
Sep 18 Python
Python设计模式之中介模式简单示例
Jan 09 Python
深入了解Python中pop和remove的使用方法
Jan 09 Python
详解Python正则表达式re模块
Mar 19 Python
python 随机森林算法及其优化详解
Jul 11 Python
python 识别登录验证码图片功能的实现代码(完整代码)
Jul 03 Python
python中Django文件上传方法详解
Aug 05 Python
如何利用Python写个坦克大战
Nov 18 Python
python 如何把docker-compose.yaml导入到数据库相关条目里
Jan 15 Python
用Python爬取英雄联盟的皮肤详细示例
Dec 06 Python
Jupyter notebook远程访问服务器的方法
May 24 #Python
Python matplotlib 画图窗口显示到gui或者控制台的实例
May 24 #Python
解决python matplotlib imshow无法显示的问题
May 24 #Python
基于MTCNN/TensorFlow实现人脸检测
May 24 #Python
Jupyter中直接显示Matplotlib的图形方法
May 24 #Python
基于python神经卷积网络的人脸识别
May 24 #Python
在PyCharm环境中使用Jupyter Notebook的两种方法总结
May 24 #Python
You might like
解析smarty模板中类似for的功能实现
2013/06/18 PHP
解析二进制流接口应用实例 pack、unpack、ord 函数使用方法
2013/06/18 PHP
不用写JS也能使用EXTJS视频演示
2008/12/29 Javascript
javascript获取选中的文本的方法代码
2013/10/30 Javascript
优化javascript的执行效率一些方法总结
2013/12/25 Javascript
流量统计器如何鉴别C#:WebBrowser中伪造referer
2015/01/07 Javascript
使用javascript提交form表单方法汇总
2015/06/25 Javascript
详解JavaScript ES6中的Generator
2015/07/28 Javascript
JavaScript代码性能优化总结篇
2016/05/15 Javascript
浅谈js中字符和数组一些基本算法题
2016/08/15 Javascript
Bootstrap优化站点资源、响应式图片、传送带使用详解3
2016/10/14 Javascript
Vue.JS入门教程之处理表单
2016/12/01 Javascript
Vue.js计算属性computed与watch(5)
2016/12/09 Javascript
Omi v1.0.2发布正式支持传递javascript表达式
2017/03/21 Javascript
angularJS深拷贝详解
2017/03/23 Javascript
详解微信小程序 相对定位和绝对定位
2017/05/11 Javascript
Node.js开发第三方微信公众平台
2017/06/05 Javascript
vue深入解析之render function code详解
2017/07/18 Javascript
Layui弹框中数据表格中可双击选择一条数据的实现
2020/05/06 Javascript
微信小程序将页面按钮悬浮固定在底部的实现代码
2020/10/29 Javascript
Vue + ts实现轮播插件的示例
2020/11/10 Javascript
[01:53]DOTA2超级联赛专访Zhou 五年职业青春成长
2013/05/29 DOTA
python实现通过pil模块对图片格式进行转换的方法
2015/03/24 Python
Python中使用遍历在列表中添加字典遇到的坑
2019/02/27 Python
Python实现根据日期获取当天凌晨时间戳的方法示例
2019/04/09 Python
python3.5安装python3-tk详解
2019/04/26 Python
django框架auth模块用法实例详解
2019/12/10 Python
Python第三方包之DingDingBot钉钉机器人
2020/04/09 Python
Jupyter Notebook远程登录及密码设置操作
2020/04/10 Python
基于python图书馆管理系统设计实例详解
2020/08/05 Python
英国独特礼物想法和个性化礼物网站:notonthehighstreet.com
2018/04/16 全球购物
电焊工工作岗位职责
2014/02/06 职场文书
会计学自我鉴定
2014/02/06 职场文书
热爱劳动主题班会
2015/08/14 职场文书
创业计划之特色精品店
2019/08/12 职场文书
Go调用Rust方法及外部函数接口前置
2022/06/14 Golang