Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
Python3 正在毁灭 Python的原因分析
Nov 28 Python
Python发送email的3种方法
Apr 28 Python
python保存字符串到文件的方法
Jul 01 Python
基于Python Numpy的数组array和矩阵matrix详解
Apr 04 Python
在pycharm上mongodb配置及可视化设置方法
Nov 30 Python
使用Python创建简单的HTTP服务器的方法步骤
Apr 26 Python
python 画二维、三维点之间的线段实现方法
Jul 07 Python
python 读取修改pcap包的例子
Jul 23 Python
利用python list完成最简单的DB连接池方法
Aug 09 Python
调用其他python脚本文件里面的类和方法过程解析
Nov 15 Python
Python必备技巧之函数的使用详解
Apr 04 Python
Python OpenGL基本配置方式
May 20 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
解析smarty模板中类似for的功能实现
2013/06/18 PHP
PHP树的深度编历生成迷宫及A*自动寻路算法实例分析
2015/03/10 PHP
Yii2框架引用bootstrap中日期插件yii2-date-picker的方法
2016/01/09 PHP
PHP图片加水印实现方法
2016/05/06 PHP
Zend Framework过滤器Zend_Filter用法详解
2016/12/09 PHP
PHP中include()与require()的区别说明
2017/02/14 PHP
基于PHP实现堆排序原理及实例详解
2020/06/19 PHP
js实时监听文本框状态的方法
2011/04/26 Javascript
Jquery ajaxStart()与ajaxStop()方法(实例讲解)
2013/12/18 Javascript
Node.js中require的工作原理浅析
2014/06/24 Javascript
js css 实现遮罩层覆盖其他页面元素附图
2014/09/22 Javascript
JS实现类似百叶窗下拉菜单效果
2016/12/30 Javascript
关于ES6的六个小特性(二)
2017/02/20 Javascript
JavaScript简介_动力节点Java学院整理
2017/06/26 Javascript
Vue项目部署的实现(阿里云+Nginx代理+PM2)
2019/03/26 Javascript
vue中的 $slot 获取插槽的节点实例
2019/11/12 Javascript
python中getaddrinfo()基本用法实例分析
2015/06/28 Python
python 获取字符串MD5值方法
2018/05/29 Python
对python3 Serial 串口助手的接收读取数据方法详解
2019/06/12 Python
python2和python3实现在图片上加汉字的方法
2019/08/22 Python
CSS3中利用animation属性创建雪花飘落特效
2014/05/14 HTML / CSS
丝芙兰加拿大官方网站:SEPHORA加拿大
2018/11/20 全球购物
南京某公司笔试题
2013/01/27 面试题
Java语言程序设计测试题改错题部分
2014/07/22 面试题
高中自我评价范文
2014/01/27 职场文书
特色蛋糕店创业计划书
2014/01/28 职场文书
《宋庆龄故居的樟树》教学反思
2014/04/07 职场文书
我的长征观后感
2015/06/09 职场文书
微观世界观后感
2015/06/10 职场文书
幼儿园小班开学寄语(2016秋季)
2015/12/03 职场文书
2016年小学“公民道德宣传日”活动总结
2016/04/01 职场文书
解决Go gorm踩过的坑
2021/04/30 Golang
原生JS实现飞机大战小游戏
2021/06/09 Javascript
mysql 直接拷贝data 目录下文件还原数据的实现
2021/07/25 MySQL
Pygame如何使用精灵和碰撞检测
2021/11/17 Python
无线电知识基础入门篇
2022/02/18 无线电