Python中scikit-learn机器学习代码实例


Posted in Python on August 05, 2018

我们给大家带来了在Python中使用scikit-learn进行机器学习的具体代码实例,以下就是全部代码内容:

# -*- coding: utf-8 -*-
 
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import cross_validation
from sklearn import preprocessing
#import iris_data
 
def load_data():
  """Load the iris dataset and split it into training and test parts.

  Returns:
    A tuple ``(x_train, y_train, x_test, y_test)``. The split is made with
    ``test_size=0.20`` and ``random_state=42``.
  """
  dataset = load_iris()
  features_train, features_test, labels_train, labels_test = train_test_split(
      dataset.data, dataset.target, test_size=0.20, random_state=42)
  return features_train, labels_train, features_test, labels_test
 
def train_clf3(train_data, train_tags):
  """Fit a ``LinearSVC`` classifier created with ``C=1100.0``.

  Args:
    train_data: training feature matrix.
    train_tags: training labels, one per row of ``train_data``.

  Returns:
    The fitted classifier.
  """
  # NOTE(review): the previous comment mentioned an 'rbf' default; that
  # describes SVC, not the LinearSVC estimator constructed here.
  model = LinearSVC(C=1100.0)
  model.fit(train_data, train_tags)
  return model
 
def train_clf(train_data, train_tags):
  """Fit a ``MultinomialNB`` classifier created with ``alpha=0.01``.

  The labels are converted to a numpy array, echoed to stdout, and then
  used for fitting.

  Args:
    train_data: training feature matrix.
    train_tags: training labels, one per row of ``train_data``.

  Returns:
    The fitted classifier.
  """
  model = MultinomialNB(alpha=0.01)
  labels = numpy.asarray(train_tags)
  print(labels)
  model.fit(train_data, labels)
  return model
 
def evaluate(actual, pred, average='weighted'):
  """Print precision, recall and F1 of ``pred`` measured against ``actual``.

  Args:
    actual: true labels.
    pred: predicted labels, aligned with ``actual``.
    average: averaging mode forwarded to the scikit-learn metric functions.
      It is passed explicitly because the library default (``'binary'``)
      raises on multi-class targets such as the three iris classes.
      ``'weighted'`` matches the ``f1_weighted`` scoring used elsewhere in
      this file. The value must produce a scalar score, so ``None`` is not
      usable with the format strings below.
  """
  m_precision = metrics.precision_score(actual, pred, average=average)
  m_recall = metrics.recall_score(actual, pred, average=average)
  m_f1 = metrics.f1_score(actual, pred, average=average)
  # Single-argument parenthesised print behaves the same on Python 2 and 3.
  print('precision:{0:.3f}'.format(m_precision))
  print('recall:{0:0.3f}'.format(m_recall))
  print('f1-score:{0:.8f}'.format(m_f1))
 
# Script driver: split the iris data, fit the naive Bayes model on the
# training part and report metrics on the held-out part.
x_train,y_train,x_test,y_test = load_data()
 
clf = train_clf(x_train, y_train)
 
# Predict the held-out samples, print the summary scores, then print the
# report returned by metrics.classification_report.
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print metrics.classification_report(y_test, pred)
 
 
使用自定义数据
# coding: utf-8
 
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import linear_model
 
# Sample training documents: each entry is one sentence whose tokens are
# separated by single spaces.
# NOTE(review): not referenced by any other code visible in this file.
train_corpus = [
   '我们 我们 好孩子 认证 。 就是',
   '我们 好孩子 认证 。 中国',
   '我们 好孩子 认证 。 孤独',
   '我们 好孩子 认证 。',
 ]
 
# Sample test documents in the same space-separated form.
test_corpus = [
   '我 菲律宾 韩国',
   '我们 好孩子 认证 。 中国',
 ]
 
def _read_tagged_file(path):
  """Parse one ``tag:words`` file into ``(documents, tags)`` lists."""
  documents = []
  tags = []
  # The context manager closes the handle even when a line fails to parse;
  # the previous version never closed either file.
  with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
    for line in handle:
      tks = line.split(':', 1)
      word_list = tks[1]
      # Drop the first character and the last three, then split on ", ".
      # NOTE(review): presumably this strips "[" and "]\r\n" from a
      # bracketed list on a CRLF-terminated line - confirm against the data.
      word_array = word_list[1:(len(word_list)-3)].split(", ")
      documents.append(" ".join(word_array))
      tags.append(tks[0])
  return documents, tags


def input_data(train_file, test_file):
  """Read the tagged training and test files.

  Each line is split once on ``':'``. The part before the colon is the tag;
  the part after it is reduced to a space-joined string of words.

  Args:
    train_file: path of the UTF-8 training file.
    test_file: path of the UTF-8 test file.

  Returns:
    A tuple ``(train_words, train_tags, test_words, test_tags)`` of lists.

  Raises:
    IndexError: if a line contains no ``':'`` separator.
  """
  train_words, train_tags = _read_tagged_file(train_file)
  test_words, test_tags = _read_tagged_file(test_file)
  return train_words, train_tags, test_words, test_tags
 
 
def vectorize(train_words, test_words):
  """Vectorize both document lists with one ``HashingVectorizer``.

  Args:
    train_words: training documents (strings).
    test_words: test documents (strings).

  Returns:
    A tuple ``(train_data, test_data)`` of feature matrices.
  """
  # Alternatives left by the original author:
  #   HashingVectorizer(n_features=25000, non_negative=True)
  #   CountVectorizer(min_df=1)
  hasher = HashingVectorizer(non_negative=True)
  train_matrix = hasher.fit_transform(train_words)
  test_matrix = hasher.fit_transform(test_words)
  return train_matrix, test_matrix
 
def vectorize1(train_words, test_words):
  """Build TF-IDF features for the training and test documents.

  The vectorizer is fitted on the training documents only. The test
  documents are then projected with ``transform`` so that they share both
  the training vocabulary and the training IDF weights. Previously a second
  vectorizer was re-fitted on the test set, which gave the test features
  IDF weights derived from the test data itself.

  Args:
    train_words: training documents (strings).
    test_words: test documents (strings).

  Returns:
    A tuple ``(train_data, test_data)`` of TF-IDF matrices with the same
    number of columns.
  """
  tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
  train_data = tv.fit_transform(train_words)
  # transform() only applies what was learned from the training set.
  test_data = tv.transform(test_words)
  return train_data, test_data
  
def vectorize2(train_words, test_words):
  """Build TF-IDF features from term counts, fitted on the training set only.

  Terms are counted with a ``CountVectorizer`` (``stop_words='english'``,
  ``max_df=0.5``) fitted on the training documents, and the counts are then
  weighted by a ``TfidfTransformer`` fitted on the training counts. The test
  documents reuse both fitted objects. Previously the transformer was
  re-fitted on the test counts, so the test IDF weights came from the test
  data.

  Args:
    train_words: training documents (strings).
    test_words: test documents (strings).

  Returns:
    A tuple ``(train_data, test_data)`` of TF-IDF matrices with the same
    number of columns.
  """
  count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
  counts_train = count_v1.fit_transform(train_words)
  # The fitted vectorizer already holds the training vocabulary, so a second
  # CountVectorizer seeded with that vocabulary is unnecessary.
  counts_test = count_v1.transform(test_words)

  tfidftransformer = TfidfTransformer()
  train_data = tfidftransformer.fit_transform(counts_train)
  # Weight the test counts with the IDF learned from the training counts.
  test_data = tfidftransformer.transform(counts_test)
  return train_data, test_data
 
def evaluate(actual, pred, average='weighted'):
  """Print precision, recall and F1 of ``pred`` measured against ``actual``.

  Args:
    actual: true labels.
    pred: predicted labels, aligned with ``actual``.
    average: averaging mode forwarded to the scikit-learn metric functions.
      It is passed explicitly because the library default (``'binary'``)
      raises on multi-class targets and on string tags that do not contain
      the default positive label. ``'weighted'`` matches the ``f1_weighted``
      scoring used in ``main``. The value must produce a scalar score, so
      ``None`` is not usable with the format strings below.
  """
  m_precision = metrics.precision_score(actual, pred, average=average)
  m_recall = metrics.recall_score(actual, pred, average=average)
  m_f1 = metrics.f1_score(actual, pred, average=average)
  # Single-argument parenthesised print behaves the same on Python 2 and 3.
  print('precision:{0:.3f}'.format(m_precision))
  print('recall:{0:0.3f}'.format(m_recall))
  print('f1-score:{0:.8f}'.format(m_f1))
 
 
def train_clf(train_data, train_tags):
  """Fit a ``MultinomialNB`` classifier created with ``alpha=0.01``.

  Args:
    train_data: training feature matrix.
    train_tags: training labels; converted to a numpy array before fitting.

  Returns:
    The fitted classifier.
  """
  model = MultinomialNB(alpha=0.01)
  labels = numpy.asarray(train_tags)
  model.fit(train_data, labels)
  return model
 
 
def train_clf1(train_data, train_tags):
  """Fit a k-nearest-neighbours classifier with the library defaults.

  Args:
    train_data: training feature matrix.
    train_tags: training labels; converted to a numpy array before fitting.

  Returns:
    The fitted classifier.
  """
  # No arguments are passed, so the library's default neighbour count is
  # used (the original comment states k=5).
  knn = KNeighborsClassifier()
  labels = numpy.asarray(train_tags)
  knn.fit(train_data, labels)
  return knn
 
def train_clf2(train_data, train_tags):
  """Fit a ``LogisticRegression`` classifier created with ``C=1e5``.

  Args:
    train_data: training feature matrix.
    train_tags: training labels, one per row of ``train_data``.

  Returns:
    The fitted classifier.
  """
  model = linear_model.LogisticRegression(C=1e5)
  model.fit(train_data, train_tags)
  return model
 
def train_clf3(train_data, train_tags):
  """Fit a ``LinearSVC`` classifier created with ``C=1100.0``.

  Args:
    train_data: training feature matrix.
    train_tags: training labels, one per row of ``train_data``.

  Returns:
    The fitted classifier.
  """
  # NOTE(review): the previous comment mentioned an 'rbf' default; that
  # describes SVC, not the LinearSVC estimator constructed here.
  model = LinearSVC(C=1100.0)
  model.fit(train_data, train_tags)
  return model
 
def train_clf4(train_data, train_tags):
  """Fit a random forest (10 trees) on a dense copy of the features.

  The original author's note says the random forest cannot be used with a
  sparse matrix, so sparse input is densified first.

  Args:
    train_data: training feature matrix. Anything exposing ``toarray()``
      (for example a scipy sparse matrix) is converted; other inputs are
      passed through unchanged.
    train_tags: training labels, one per row of ``train_data``.

  Returns:
    The fitted classifier.
  """
  # toarray() yields a plain ndarray. The previous todense() call yields a
  # numpy.matrix, which recent scikit-learn releases refuse as input.
  if hasattr(train_data, 'toarray'):
    train_data = train_data.toarray()
  clf = RandomForestClassifier(n_estimators=10)
  clf.fit(train_data, train_tags)
  return clf
 
# Read a file line by line using codecs.
def codecs_read_label_line(filename):
  """Return the lines of a UTF-8 text file without their line terminators.

  Undecodable bytes are ignored. A trailing ``\\n`` or ``\\r\\n`` is removed
  from each line. Unlike the previous fixed "drop the last character" slice,
  an unterminated final line keeps all of its characters and CRLF files do
  not leave a stray ``\\r`` behind.

  Args:
    filename: path of the file to read.

  Returns:
    A list with one string per line, in file order.
  """
  # The context manager closes the file even if decoding or reading fails.
  with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
    return [line.rstrip('\r\n') for line in f]
 
def save_test_features(test_url, test_label):
  """Write ``<line of test.dat><TAB><label>`` rows to ``test_labeded.dat``.

  The lines of ``test.dat`` are paired positionally with ``test_label``.
  ``zip`` stops at the shorter of the two sequences.

  Args:
    test_url: not used by this function. The features are always re-read
      from ``test.dat``. The parameter is kept so existing calls keep
      working.
    test_label: iterable of string labels.
  """
  test_feature_list = codecs_read_label_line('test.dat')
  # Write explicitly as UTF-8 so non-ASCII text does not depend on an
  # implicit default encoding. The context manager closes the file even if
  # a write fails.
  with codecs.open('test_labeded.dat', 'w', 'utf-8') as fw:
    for (url, label) in zip(test_feature_list, test_label):
      fw.write(url + '\t' + label)
      fw.write('\n')
 
def main():
  """Train a LinearSVC on TF-IDF features and report how it performs.

  Steps: read the tagged train/test files, vectorize them with
  ``vectorize1``, print a few facts about the feature matrices, fit the
  classifier, print 5-fold ``f1_weighted`` cross-validation scores, print
  every misclassified test sample followed by the number of errors, and
  finally print the precision/recall/F1 summary.
  """
  train_path = u'..\\file\\py_train.txt'
  test_path = u'..\\file\\py_test.txt'
  train_words, train_tags, test_words, test_tags = input_data(train_path, test_path)

  train_data, test_data = vectorize1(train_words, test_words)
  # Diagnostics about the feature matrices.
  print(type(train_data))
  print(train_data.shape)
  print(test_data.shape)
  print(test_data[0].shape)
  print(numpy.asarray(test_data[0]))

  model = train_clf3(train_data, train_tags)

  fold_scores = cross_validation.cross_val_score(
      model, train_data, train_tags, cv=5, scoring="f1_weighted")
  print(fold_scores)

  # Disabled alternative:
  #   predicted = cross_validation.cross_val_predict(clf, train_data, train_tags, cv=5)
  predictions = model.predict(test_data)
  mismatches = []
  for expected, predicted in zip(test_tags, predictions):
    if expected == predicted:
      continue
    # Show the true tag and the predicted tag separated by one space.
    print('%s %s' % (expected, predicted))
    mismatches.append(expected + ' ' + predicted)
  print(len(mismatches))
  evaluate(numpy.asarray(test_tags), predictions)
  # Disabled: write out the labelling result.
  #   test_feature_list = codecs_read_label_line('test.dat')
  #   save_test_features(test_feature_list, pred)
 
if __name__ == '__main__':
  main()
Python 相关文章推荐
PHP webshell检查工具 python实现代码
Sep 15 Python
python 禁止函数修改列表的实现方法
Aug 03 Python
全面了解Nginx, WSGI, Flask之间的关系
Jan 09 Python
Python中生成器和迭代器的区别详解
Feb 10 Python
Python实现的维尼吉亚密码算法示例
Apr 12 Python
Python实现ping指定IP的示例
Jun 04 Python
Python拼接微信好友头像大图的实现方法
Aug 01 Python
对python读取zip压缩文件里面的csv数据实例详解
Feb 08 Python
numpy.where() 用法详解
May 27 Python
django序列化serializers过程解析
Dec 14 Python
Opencv python 图片生成视频的方法示例
Nov 18 Python
解决Pyinstaller打包软件失败的一个坑
Mar 04 Python
解决使用pycharm提交代码时冲突之后文件丢失找回的方法
Aug 05 #Python
Python字符串、整数、和浮点型数相互转换实例
Aug 04 #Python
python与caffe改变通道顺序的方法
Aug 04 #Python
Python爬虫PyQuery库基本用法入门教程
Aug 04 #Python
python list转矩阵的实例讲解
Aug 04 #Python
Python 生成 -1~1 之间的随机数矩阵方法
Aug 04 #Python
Python爬虫框架scrapy实现downloader_middleware设置proxy代理功能示例
Aug 04 #Python
You might like
PHP is_dir() 判断给定文件名是否是一个目录
2010/05/10 PHP
array_multisort实现PHP多维数组排序示例讲解
2011/01/04 PHP
php结合表单实现一些简单功能的例子
2011/06/04 PHP
destoon实现底部添加你是第几位访问者的方法
2014/07/15 PHP
YII中assets的使用示例
2014/07/31 PHP
php双层循环(九九乘法表)
2017/10/23 PHP
PHP 文件上传限制问题
2019/09/01 PHP
JavaScript Undefined,Null类型和NaN值区别
2008/10/22 Javascript
toString()一个会自动调用的方法
2010/02/08 Javascript
jquery 1.4.2发布!主要是性能与API
2010/02/25 Javascript
基于jquery实现拆分姓名的方法(纯JS版)
2013/05/08 Javascript
利用函数的惰性载入提高javascript代码执行效率
2014/05/05 Javascript
JQueryEasyUI之DataGrid数据显示
2016/11/23 Javascript
原生JS实现的多个彩色小球跟随鼠标移动动画效果示例
2018/02/01 Javascript
一份超级详细的Vue-cli3.0使用教程【推荐】
2018/11/15 Javascript
ES6 Class中实现私有属性的一些方法总结
2019/07/08 Javascript
关于layui时间回显问题的解决方法
2019/09/24 Javascript
[03:07]【DOTA2亚洲邀请赛】我们,梦开始的地方
2017/03/07 DOTA
Python函数可变参数定义及其参数传递方式实例详解
2015/05/25 Python
使用py2exe在Windows下将Python程序转为exe文件
2016/03/04 Python
Python2.7下安装Scrapy框架步骤教程
2017/12/22 Python
python实现pdf转换成word/txt纯文本文件
2018/06/07 Python
Python实现蒙特卡洛算法小实验过程详解
2019/07/12 Python
Python *args和**kwargs用法实例解析
2020/03/02 Python
基于CSS3 animation动画属性实现轮播图效果
2017/09/12 HTML / CSS
详解使用postMessage解决iframe跨域通信问题
2019/11/01 HTML / CSS
联想马来西亚官方网站:Lenovo Malaysia
2018/09/19 全球购物
Python使用openpyxl复制整张sheet
2021/03/24 Python
服务之星获奖感言
2014/01/21 职场文书
党员2014两会学习心得体会
2014/03/17 职场文书
融资租赁计划书
2014/04/29 职场文书
本科毕业生自荐信
2014/05/26 职场文书
食品安全处置方案
2014/06/14 职场文书
党建工作整改措施
2014/10/28 职场文书
门面房租房协议书
2014/12/01 职场文书
Go中的条件语句Switch示例详解
2021/08/23 Golang