python中scikit-learn机器代码实例


Posted in Python onAugust 05, 2018

我们给大家带来了关于学习python中scikit-learn机器代码的相关具体实例,以下就是全部代码内容:

# -*- coding: utf-8 -*-
 
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import cross_validation
from sklearn import preprocessing
#import iris_data
 
def load_data():
  iris = load_iris()
  x, y = iris.data, iris.target
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
  return x_train,y_train,x_test,y_test
 
def train_clf3(train_data, train_tags):
  clf = LinearSVC(C=1100.0)#default with 'rbf' 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf(train_data, train_tags):
  clf = MultinomialNB(alpha=0.01)
  print numpy.asarray(train_tags)
  clf.fit(train_data, numpy.asarray(train_tags))
  return clf
 
def evaluate(actual, pred):
  m_precision = metrics.precision_score(actual, pred)
  m_recall = metrics.recall_score(actual, pred)
  print 'precision:{0:.3f}'.format(m_precision)
  print 'recall:{0:0.3f}'.format(m_recall)
  print 'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred));
 
x_train,y_train,x_test,y_test = load_data()
 
clf = train_clf(x_train, y_train)
 
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print metrics.classification_report(y_test, pred)
 
 
使用自定义数据
# coding: utf-8
 
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import linear_model
 
train_corpus = [
   '我们 我们 好孩子 认证 。 就是',
   '我们 好孩子 认证 。 中国',
   '我们 好孩子 认证 。 孤独',
   '我们 好孩子 认证 。',
 ]
 
test_corpus = [
   '我 菲律宾 韩国',
   '我们 好孩子 认证 。 中国',
 ]
 
def input_data(train_file, test_file):
  train_words = []
  train_tags = []
  test_words = []
  test_tags = []
  f1 = codecs.open(train_file,'r','utf-8','ignore')
  for line in f1:
    tks = line.split(':', 1)
    word_list = tks[1]
    word_array = word_list[1:(len(word_list)-3)].split(", ")
    train_words.append(" ".join(word_array))
    train_tags.append(tks[0])
  f2 = codecs.open(test_file,'r','utf-8','ignore')
  for line in f2:
    tks = line.split(':', 1)
    word_list = tks[1]
    word_array = word_list[1:(len(word_list)-3)].split(", ")
    test_words.append(" ".join(word_array))
    test_tags.append(tks[0])
  return train_words, train_tags, test_words, test_tags
 
 
def vectorize(train_words, test_words):
  #v = HashingVectorizer(n_features=25000, non_negative=True)
  v = HashingVectorizer(non_negative=True)
  #v = CountVectorizer(min_df=1)
  train_data = v.fit_transform(train_words)
  test_data = v.fit_transform(test_words)
  return train_data, test_data
 
def vectorize1(train_words, test_words):
  tv = TfidfVectorizer(sublinear_tf = False,use_idf=True);
  train_data = tv.fit_transform(train_words);
  tv2 = TfidfVectorizer(vocabulary = tv.vocabulary_);
  test_data = tv2.fit_transform(test_words);
  return train_data, test_data
  
def vectorize2(train_words, test_words):
  count_v1= CountVectorizer(stop_words = 'english', max_df = 0.5); 
  counts_train = count_v1.fit_transform(train_words); 
   
  count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_);
  counts_test = count_v2.fit_transform(test_words);
   
  tfidftransformer = TfidfTransformer();
   
  train_data = tfidftransformer.fit(counts_train).transform(counts_train); 
  test_data = tfidftransformer.fit(counts_test).transform(counts_test);
  return train_data, test_data
 
def evaluate(actual, pred):
  m_precision = metrics.precision_score(actual, pred)
  m_recall = metrics.recall_score(actual, pred)
  print 'precision:{0:.3f}'.format(m_precision)
  print 'recall:{0:0.3f}'.format(m_recall)
  print 'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred));
 
 
def train_clf(train_data, train_tags):
  clf = MultinomialNB(alpha=0.01)
  clf.fit(train_data, numpy.asarray(train_tags))
  return clf
 
 
def train_clf1(train_data, train_tags):
  #KNN Classifier
  clf = KNeighborsClassifier()#default with k=5 
  clf.fit(train_data, numpy.asarray(train_tags)) 
  return clf
 
def train_clf2(train_data, train_tags):
  clf = linear_model.LogisticRegression(C=1e5) 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf3(train_data, train_tags):
  clf = LinearSVC(C=1100.0)#default with 'rbf' 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf4(train_data, train_tags):
  """
  随机森林,不可使用稀疏矩阵
  """
  clf = RandomForestClassifier(n_estimators=10)
  clf.fit(train_data.todense(),train_tags)
  return clf
 
#使用codecs逐行读取
def codecs_read_label_line(filename):
  label_list=[]
  f = codecs.open(filename,'r','utf-8','ignore')
  line = f.readline()
  while line:
    #label_list.append(line[0:len(line)-2])
    label_list.append(line[0:len(line)-1])
    line = f.readline()
  f.close()
  return label_list
 
def save_test_features(test_url, test_label):
  test_feature_list = codecs_read_label_line('test.dat')
  fw = open('test_labeded.dat',"w+")
  
  for (url,label) in zip(test_feature_list,test_label):
    fw.write(url+'\t'+label)
    fw.write('\n')
  fw.close()
 
def main():
  train_file = u'..\\file\\py_train.txt'
  test_file = u'..\\file\\py_test.txt'
  train_words, train_tags, test_words, test_tags = input_data(train_file, test_file)
  #print len(train_words), len(train_tags), len(test_words), len(test_words), 
  
  train_data, test_data = vectorize1(train_words, test_words)
  print type(train_data)
  print train_data.shape
  print test_data.shape
  print test_data[0].shape
  print numpy.asarray(test_data[0])
  
  clf = train_clf3(train_data, train_tags)
  
  scores = cross_validation.cross_val_score(
  clf, train_data, train_tags, cv=5, scoring="f1_weighted")
  print scores
 
  #predicted = cross_validation.cross_val_predict(clf, train_data,train_tags, cv=5)  
  '''
  
  '''
  pred = clf.predict(test_data)
  error_list=[]
  for (true_tag,predict_tag) in zip(test_tags,pred):
    if true_tag != predict_tag:
      print true_tag,predict_tag
      error_list.append(true_tag+' '+predict_tag)
  print len(error_list)
  evaluate(numpy.asarray(test_tags), pred)
  '''
  #输出打标签结果
  test_feature_list = codecs_read_label_line('test.dat')
  save_test_features(test_feature_list, pred)
  '''
  
 
if __name__ == '__main__':
  main()
Python 相关文章推荐
Python ORM框架SQLAlchemy学习笔记之安装和简单查询实例
Jun 10 Python
Python2.6版本中实现字典推导 PEP 274(Dict Comprehensions)
Apr 28 Python
Django实现自定义404,500页面教程
Mar 26 Python
Python实现爬取百度贴吧帖子所有楼层图片的爬虫示例
Apr 26 Python
python dataframe 输出结果整行显示的方法
Jun 14 Python
python3.6使用tkinter实现弹跳小球游戏
May 09 Python
Python用Try语句捕获异常的实例方法
Jun 26 Python
Python imageio读取视频并进行编解码详解
Dec 10 Python
python关于调用函数外的变量实例
Dec 26 Python
tensorflow 只恢复部分模型参数的实例
Jan 06 Python
python数据分析工具之 matplotlib详解
Apr 09 Python
python 安装移动复制第三方库操作
Jul 13 Python
解决使用pycharm提交代码时冲突之后文件丢失找回的方法
Aug 05 #Python
Python字符串、整数、和浮点型数相互转换实例
Aug 04 #Python
python与caffe改变通道顺序的方法
Aug 04 #Python
Python爬虫PyQuery库基本用法入门教程
Aug 04 #Python
python list转矩阵的实例讲解
Aug 04 #Python
Python 生成 -1~1 之间的随机数矩阵方法
Aug 04 #Python
Python爬虫框架scrapy实现downloader_middleware设置proxy代理功能示例
Aug 04 #Python
You might like
PHP系统流量分析的程序
2006/10/09 PHP
MySQL 日期时间函数常用总结
2012/06/12 PHP
深入PHP nl2br()格式化输出的详解
2013/06/05 PHP
Javascript与vbscript数据共享
2007/01/09 Javascript
jquery星级插件、支持页面中多次使用
2012/03/25 Javascript
JS 获取滚动条高度示例代码
2013/10/24 Javascript
javascript实现yield的方法
2013/11/06 Javascript
浅谈JS闭包中的循环绑定处理程序
2014/11/09 Javascript
Jquery中巧用Ajax的beforeSend方法
2016/01/20 Javascript
详解JavaScript异步编程中jQuery的promise对象的作用
2016/05/03 Javascript
Vue.js路由组件vue-router使用方法详解
2016/12/02 Javascript
AngularJS ng-repeat指令中使用track by子语句解决重复数据遍历错误问题
2017/01/21 Javascript
关于TypeScript中import JSON的正确姿势详解
2017/07/25 Javascript
微信小程序实现image组件图片自适应宽度比例显示的方法
2018/01/16 Javascript
Vue源码探究之状态初始化
2018/11/14 Javascript
如何实现双向绑定mvvm的原理实现
2019/05/28 Javascript
vue服务端渲染操作简单入门实例分析
2019/08/28 Javascript
JS前端面试必备——基本排序算法原理与实现方法详解【插入/选择/归并/冒泡/快速排序】
2020/02/24 Javascript
javascript实现简单搜索功能
2020/03/26 Javascript
解决element-ui的下拉框有值却无法选中的情况
2020/11/07 Javascript
[39:08]完美世界DOTA2联赛PWL S3 LBZS vs CPG 第一场 12.12
2020/12/16 DOTA
Python 调用DLL操作抄表机
2009/01/12 Python
python条件和循环的使用方法
2013/11/01 Python
python使用urlparse分析网址中域名的方法
2015/04/15 Python
在Django中进行用户注册和邮箱验证的方法
2016/05/09 Python
Python  Django 母版和继承解析
2019/08/09 Python
解决jupyter运行pyqt代码内核重启的问题
2020/04/16 Python
Python 如何定义匿名或内联函数
2020/08/01 Python
详解如何在PyCharm控制台中输出彩色文字和背景
2020/08/17 Python
浅谈html5 video 移动端填坑记
2018/01/15 HTML / CSS
英国文具、办公用品和科技商店:Ryman
2018/09/27 全球购物
说明书格式及范文
2014/05/07 职场文书
学前教育专业求职信
2014/09/02 职场文书
学年个人总结范文
2015/03/05 职场文书
高校自主招生校长推荐信
2015/03/23 职场文书
爱心捐赠活动简讯
2015/07/20 职场文书