Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
Python高级应用实例对比:高效计算大文件中的最长行的长度
Jun 08 Python
Python中使用logging模块代替print(logging简明指南)
Jul 09 Python
python时间日期函数与利用pandas进行时间序列处理详解
Mar 13 Python
Python结合ImageMagick实现多张图片合并为一个pdf文件的方法
Apr 24 Python
对PyTorch torch.stack的实例讲解
Jul 30 Python
python循环输出三角形图案的例子
Nov 22 Python
python3的UnicodeDecodeError解决方法
Dec 20 Python
基于Keras中Conv1D和Conv2D的区别说明
Jun 19 Python
Python Flask异步发送邮件实现方法解析
Aug 01 Python
Python Charles抓包配置实现流程图解
Sep 29 Python
Pytorch 如何实现常用正则化
May 27 Python
如何正确理解python装饰器
Jun 15 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
php生成短网址示例
2014/05/05 PHP
ThinkPHP模板判断输出Defined标签用法详解
2014/06/30 PHP
destoon调用discuz论坛中带图片帖子的实现方法
2014/08/21 PHP
phpstorm编辑器乱码问题解决
2014/12/01 PHP
PHP实现获取中英文首字母
2015/06/19 PHP
在javascript将NodeList作为Array数组处理的方法
2010/07/09 Javascript
js确定对象类型方法
2012/03/30 Javascript
借助script进行Http跨域请求:JSONP实现原理及代码
2013/03/19 Javascript
关于事件mouseover ,mouseout ,mouseenter,mouseleave的区别
2015/10/12 Javascript
图片旋转、鼠标滚轮缩放、镜像、切换图片js代码
2020/12/13 Javascript
jQuery控制文本框只能输入数字和字母及使用方法
2016/05/26 Javascript
jquery自动补齐功能插件flexselect用法示例
2016/08/06 Javascript
Vue.js每天必学之过渡与动画
2016/09/06 Javascript
Angularjs 实现一个幻灯片示例代码
2016/09/08 Javascript
给easyui datebox扩展一个清空的实例
2016/11/09 Javascript
js中开关变量使用实例
2017/02/24 Javascript
jQuery使用EasyUi实现三级联动下拉框效果
2017/03/08 Javascript
ES6 Promise对象概念与用法分析
2017/04/01 Javascript
vue-cli整合vuex的时候,修改actions和mutations,实现热部署的方法
2018/09/19 Javascript
浅谈layui 表单元素的选中问题
2019/10/25 Javascript
vue实现输入框自动跳转功能
2020/05/20 Javascript
JavaScript中while循环的基础使用教程
2020/08/11 Javascript
python实现定制交互式命令行的方法
2014/07/03 Python
在Python的Django框架的视图中使用Session的方法
2015/07/23 Python
pytorch 实现删除tensor中的指定行列
2020/01/13 Python
Python3与fastdfs分布式文件系统如何实现交互
2020/06/23 Python
Python进行统计建模
2020/08/10 Python
Python数据可视化常用4大绘图库原理详解
2020/10/23 Python
Python 爬取淘宝商品信息栏目的实现
2021/02/06 Python
canvas里面如何基于随机点绘制一个多边形的方法
2018/06/13 HTML / CSS
GOLFINO英国官网:高尔夫服装
2020/04/11 全球购物
大学生预备党员自我评价
2015/03/04 职场文书
小学开学典礼新闻稿
2015/07/17 职场文书
2016教师给学生的毕业寄语
2015/12/04 职场文书
2019最新版股权转让及委托持股协议书范本
2019/08/07 职场文书
Mysql中有关Datetime和Timestamp的使用总结
2021/12/06 MySQL