Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
python写xml文件的操作实例
Oct 05 Python
Python下Fabric的简单部署方法
Jul 14 Python
详解tensorflow实现迁移学习实例
Feb 10 Python
python使用Pycharm创建一个Django项目
Mar 05 Python
Django使用Celery异步任务队列的使用
Mar 13 Python
Python实现的文本对比报告生成工具示例
May 22 Python
Python将8位的图片转为24位的图片实现方法
Oct 24 Python
Python图像处理之gif动态图的解析与合成操作详解
Dec 30 Python
PyTorch中常用的激活函数的方法示例
Aug 20 Python
python 表格打印代码实例解析
Oct 12 Python
python之MSE、MAE、RMSE的使用
Feb 24 Python
Django ORM filter() 的运用详解
May 14 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
PHP学习笔记之一
2011/01/17 PHP
php ci框架中加载css和js文件失败的解决方法
2014/03/03 PHP
php遍历类中包含的所有元素的方法
2015/05/12 PHP
CodeIgniter与PHP5.6的兼容问题
2015/07/16 PHP
利用switch语句进行多选一判断的实例代码
2016/11/14 PHP
PHP基于array_unique实现二维数组去重
2020/07/14 PHP
javascript 动态添加事件代码
2008/11/30 Javascript
javascript实现在某个元素上阻止鼠标右键事件的方法和实例
2014/08/12 Javascript
javascript弹出拖动窗口
2015/08/11 Javascript
JavaScript实现图片轮播组件代码示例
2016/11/22 Javascript
js中开关变量使用实例
2017/02/24 Javascript
js监听html页面的上下滚动事件方法
2018/09/11 Javascript
一篇不错的Python入门教程
2007/02/08 Python
python简单猜数游戏实例
2015/07/09 Python
关于Python元祖,列表,字典,集合的比较
2017/01/06 Python
浅谈python函数之作用域(python3.5)
2017/10/27 Python
解决pycharm 安装numpy失败的问题
2019/12/05 Python
通过实例解析Python return运行原理
2020/03/04 Python
解决Python中报错TypeError: must be str, not bytes问题
2020/04/07 Python
CSS3等相关属性制作分页导航实现代码
2012/12/24 HTML / CSS
viagogo波兰票务平台:演唱会、体育比赛、戏剧门票
2018/04/23 全球购物
国际花店:Pickup Flowers
2020/04/10 全球购物
linux系统都有哪些运行级别
2016/03/26 面试题
商场中秋节广播稿
2014/01/17 职场文书
高中生期末评语
2014/01/28 职场文书
致共产党员倡议书
2014/04/16 职场文书
3的组成教学反思
2014/04/30 职场文书
英文求职信范文
2014/05/23 职场文书
个人查摆剖析材料
2014/10/04 职场文书
工厂门卫岗位职责
2015/04/13 职场文书
小学毕业感言100字
2015/07/30 职场文书
小学英语教师研修感悟
2015/11/18 职场文书
掌握这项技巧,一年阅读300本书不是梦
2019/09/12 职场文书
MySQL时间设置注意事项的深入总结
2021/05/06 MySQL
经典《舰娘》游改全新动画预告 预定11月开播
2022/04/01 日漫
Win11如何启用启动修复 ? Win11执行启动修复的三种方法
2022/04/08 数码科技